From 9d852692025b54bbdab64adc21ac0119776f2139 Mon Sep 17 00:00:00 2001 From: Dantenerosas Date: Tue, 10 Oct 2023 03:41:55 +0300 Subject: [PATCH] rework yappy_parser.py, Added dzen_parser.py, minor fixes --- src/core/redis_client.py | 15 ------------ src/core/uploader.py | 5 +--- src/parsers/Dzen/dzen_parser.py | 39 +++++++++++++++++++++++++++++++ src/parsers/Yappy/yappy_parser.py | 32 ++++++++++++++++++------- src/parsers/base_parser.py | 2 ++ src/parsers/parser_mapping.py | 7 ++++-- 6 files changed, 70 insertions(+), 30 deletions(-) create mode 100644 src/parsers/Dzen/dzen_parser.py diff --git a/src/core/redis_client.py b/src/core/redis_client.py index 5a42fcc..c304de8 100644 --- a/src/core/redis_client.py +++ b/src/core/redis_client.py @@ -30,21 +30,6 @@ class RedisClient: res = await connection.hgetall(queue_name) return res - # async def get_tasks(self) -> set: - # async with self.connection as connection: - # res = await connection.smembers(self.TASKS_NAME + f":1") - # return res - - # async def get_task_done_queue(self) -> set: - # async with self.connection as connection: - # res = await connection.smembers(self.TASKS_DONE_NAME + f":1") - # return res - - # async def del_task_from_queue(self, link, task: dict) -> int: - # async with self.connection as connection: - # res = await self._del_task(self.SET_NAME, json.dumps(task, indent=4).encode('utf-8')) - # return res - async def del_task_from_queue_and_add_to_tasks(self, link: str, task: dict | list) -> int: await self._del_task(self.SET_NAME, link) return await self._set_task(self.TASKS_NAME, link, task) diff --git a/src/core/uploader.py b/src/core/uploader.py index fe6cedf..fe05fc6 100644 --- a/src/core/uploader.py +++ b/src/core/uploader.py @@ -4,21 +4,18 @@ from minio.error import S3Error def main(): client = Minio( - "s3.grfc.ru:443", + "grfc.ru", access_key="cl-i-oculus-dev1", secret_key="Nom8qKEU6IYtQSrNt5ZPN1XncQTZdtUM", secure=True ) - # Make 'asiatrip' bucket if not exist. found = client.bucket_exists("clean-internet-oculus-integration-dev") if not found: client.make_bucket("clean-internet-oculus-integration-dev") else: print("Bucket 'clean-internet-oculus-integration-dev' already exists") - # Upload '/home/user/Photos/asiaphotos.zip' as object name - # 'asiaphotos-2015.zip' to bucket 'asiatrip'. client.fput_object( "clean-internet-oculus-integration-dev", "4uv2GNc_ybc_1080p.mp4", "/Users/garickbadalov/PycharmProjects/video_downloader_service/downloads/Youtube/4uv2GNc_ybc_1080p.mp4", ) diff --git a/src/parsers/Dzen/dzen_parser.py b/src/parsers/Dzen/dzen_parser.py new file mode 100644 index 0000000..971960f --- /dev/null +++ b/src/parsers/Dzen/dzen_parser.py @@ -0,0 +1,39 @@ +import json +import os +import re + +from playwright.sync_api import Playwright +from playwright.sync_api import sync_playwright + +from src.parsers.base_parser import BaseParser + + +class DzenParser(BaseParser): + BASE_DIR = os.path.abspath(f"downloads/Dzen") + + def get_video_link(self, playwright: Playwright): + browser = playwright.chromium.launch(headless=True) + context = browser.new_context() + page = context.new_page() + page.goto(url=self.params["link"], wait_until='domcontentloaded') + link = page.text_content("xpath=//script[contains(text(), 'serverState')]") + links_json_starts_with = re.findall(r'\W\WStreamInfo', link)[0] + links_json_ends_with = re.findall(r'\W\W\W\WsocialInfo', link)[0] + _, _, link = link.partition(links_json_starts_with) + links, _, _ = link.partition(links_json_ends_with) + link = json.loads(links_json_starts_with + links)["StreamInfo"][-1]["OutputStream"] + title = json.loads(links_json_starts_with + links)["Uuid"] + return link, title + + def video_download(self, link: str = None, title: str = None): + with sync_playwright() as playwright: + + if not link and not title: + link, title = self.get_video_link(playwright) + + base_link = self.params["link"] + self.params["link"] = link + self.params["outtmpl"] = f"downloads/ZenYandex/{title}_%(resolution)s.%(ext)s" + file_path = super().video_download() + self.params["link"] = base_link + return file_path diff --git a/src/parsers/Yappy/yappy_parser.py b/src/parsers/Yappy/yappy_parser.py index 7135c52..1dc1f2e 100644 --- a/src/parsers/Yappy/yappy_parser.py +++ b/src/parsers/Yappy/yappy_parser.py @@ -2,6 +2,8 @@ import os import requests from bs4 import BeautifulSoup +from playwright.sync_api import Playwright +from playwright.sync_api import sync_playwright from src.exceptions.download_exceptions import FileAlreadyExistException from src.parsers.base_parser import BaseParser @@ -9,19 +11,31 @@ from src.parsers.base_parser import BaseParser class YappyParser(BaseParser): BASE_DIR = os.path.abspath(f"downloads/Yappy") + # OLD WAY + # def get_video_link(self): + # resp = requests.get(self.params["link"]) + # resp.encoding = self.BASE_ENCODING + # soup = BeautifulSoup(resp.text, 'lxml') + # + # link = soup.find('video').get("src") + # title = soup.find('video').get("id") + # return link, title - def get_video_link(self): - resp = requests.get(self.params["link"]) - resp.encoding = self.BASE_ENCODING - soup = BeautifulSoup(resp.text, 'lxml') - - link = soup.find('video').get("src") - title = soup.find('video').get("id") + def get_video_link(self, playwright: Playwright): + browser = playwright.chromium.launch(headless=True) + context = browser.new_context() + page = context.new_page() + page.goto(url=self.params["link"], wait_until='domcontentloaded') + link = page.get_attribute("xpath=//video", "src") + title = page.get_attribute("xpath=//video", "id") return link, title + def video_download(self, link: str = None, title: str = None): - if not link and not title: - link, title = self.get_video_link() + with sync_playwright() as playwright: + + if not link and not title: + link, title = self.get_video_link(playwright) if os.path.exists(os.path.join(os.getcwd() + f"/downloads/Yappy/{title}.mp4")): raise FileAlreadyExistException(message=f"Yappy/{title}.mp4") diff --git a/src/parsers/base_parser.py b/src/parsers/base_parser.py index d1f9aee..87ab363 100644 --- a/src/parsers/base_parser.py +++ b/src/parsers/base_parser.py @@ -31,6 +31,8 @@ class BaseParser: resolution = "NA" if "Yahoo" in ydl_opts["outtmpl"]["default"]: path_to_video = f"Yahoo/{downloader.info['id']}_{resolution}.{downloader.info['ext']}" + elif "ZenYandex" in ydl_opts["outtmpl"]["default"]: + path_to_video = f"ZenYandex/{downloader.info['id']}_{resolution}.{downloader.info['ext']}" else: path_to_video = f"{downloader.info['extractor_key']}/{downloader.info['id']}_{resolution}.{downloader.info['ext']}" if os.path.exists(os.path.join(os.getcwd() + "/downloads/" + path_to_video)): diff --git a/src/parsers/parser_mapping.py b/src/parsers/parser_mapping.py index 0c3f9d5..f1e100a 100644 --- a/src/parsers/parser_mapping.py +++ b/src/parsers/parser_mapping.py @@ -1,9 +1,11 @@ from collections import OrderedDict import re +from src.parsers.Dzen.dzen_parser import DzenParser from src.parsers.MyMail.my_mail_parser import MyMailParser from src.parsers.Okru.ok_parser import OkParser from src.parsers.Yahoo.yahoo_parser import YahooParser +from src.parsers.Yappy.yappy_parser import YappyParser from src.parsers.base_parser import BaseParser @@ -19,11 +21,12 @@ parser_mapping = OrderedDict( compile_regex(r"^ok.ru/okvideo/topic"): OkParser, compile_regex(r"^ok.ru/video"): BaseParser, compile_regex(r"^...?likee.video/"): BaseParser, - compile_regex(r"^dzen.ru/"): BaseParser, - compile_regex(r"^yappy.media/"): BaseParser, + compile_regex(r"^dzen.ru/"): DzenParser, + compile_regex(r"^yappy.media/"): YappyParser, compile_regex(r"^yandex.ru/"): BaseParser, compile_regex(r"^.*\.yahoo.com/"): YahooParser, compile_regex(r"^.*\.livejournal.com/"): BaseParser, + compile_regex(r"^.*\.dzen.ru/"): BaseParser, } )