rework yappy_parser.py, Added dzen_parser.py, minor fixes
This commit is contained in:
		| @@ -30,21 +30,6 @@ class RedisClient: | ||||
|             res = await connection.hgetall(queue_name) | ||||
|         return res | ||||
|  | ||||
|     # async def get_tasks(self) -> set: | ||||
|     #     async with self.connection as connection: | ||||
|     #         res = await connection.smembers(self.TASKS_NAME + f":1") | ||||
|     #     return res | ||||
|  | ||||
|     # async def get_task_done_queue(self) -> set: | ||||
|     #     async with self.connection as connection: | ||||
|     #         res = await connection.smembers(self.TASKS_DONE_NAME + f":1") | ||||
|     #     return res | ||||
|  | ||||
|     # async def del_task_from_queue(self, link, task: dict) -> int: | ||||
|     #     async with self.connection as connection: | ||||
|     #         res = await self._del_task(self.SET_NAME, json.dumps(task, indent=4).encode('utf-8')) | ||||
|     #     return res | ||||
|  | ||||
|     async def del_task_from_queue_and_add_to_tasks(self, link: str, task: dict | list) -> int: | ||||
|         await self._del_task(self.SET_NAME, link) | ||||
|         return await self._set_task(self.TASKS_NAME, link, task) | ||||
|   | ||||
| @@ -4,21 +4,18 @@ from minio.error import S3Error | ||||
|  | ||||
| def main(): | ||||
|     client = Minio( | ||||
|         "s3.grfc.ru:443", | ||||
|         "grfc.ru", | ||||
|         access_key="cl-i-oculus-dev1", | ||||
|         secret_key="Nom8qKEU6IYtQSrNt5ZPN1XncQTZdtUM", | ||||
|         secure=True | ||||
|     ) | ||||
|  | ||||
|     # Make 'asiatrip' bucket if not exist. | ||||
|     found = client.bucket_exists("clean-internet-oculus-integration-dev") | ||||
|     if not found: | ||||
|         client.make_bucket("clean-internet-oculus-integration-dev") | ||||
|     else: | ||||
|         print("Bucket 'clean-internet-oculus-integration-dev' already exists") | ||||
|  | ||||
|     # Upload '/home/user/Photos/asiaphotos.zip' as object name | ||||
|     # 'asiaphotos-2015.zip' to bucket 'asiatrip'. | ||||
|     client.fput_object( | ||||
|         "clean-internet-oculus-integration-dev", "4uv2GNc_ybc_1080p.mp4", "/Users/garickbadalov/PycharmProjects/video_downloader_service/downloads/Youtube/4uv2GNc_ybc_1080p.mp4", | ||||
|     ) | ||||
|   | ||||
							
								
								
									
										39
									
								
								src/parsers/Dzen/dzen_parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								src/parsers/Dzen/dzen_parser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,39 @@ | ||||
| import json | ||||
| import os | ||||
| import re | ||||
|  | ||||
| from playwright.sync_api import Playwright | ||||
| from playwright.sync_api import sync_playwright | ||||
|  | ||||
| from src.parsers.base_parser import BaseParser | ||||
|  | ||||
|  | ||||
class DzenParser(BaseParser):
    """Parser for dzen.ru video pages.

    Resolves the direct stream URL by loading the page headlessly with
    Playwright and extracting it from the embedded ``serverState`` JSON.
    """

    BASE_DIR = os.path.abspath("downloads/Dzen")

    def get_video_link(self, playwright: Playwright):
        """Open ``self.params["link"]`` and extract the stream URL.

        Returns a ``(link, title)`` tuple where *link* is the last
        ``OutputStream`` entry of ``StreamInfo`` and *title* is the video's
        ``Uuid``.
        """
        browser = playwright.chromium.launch(headless=True)
        try:
            context = browser.new_context()
            page = context.new_page()
            page.goto(url=self.params["link"], wait_until='domcontentloaded')
            raw = page.text_content("xpath=//script[contains(text(), 'serverState')]")
        finally:
            # fix: the browser was never closed, leaking a headless process
            # per call.
            browser.close()
        # Slice the script text down to the JSON object lying between the
        # quoted 'StreamInfo' key and the 'socialInfo' key.
        start_marker = re.findall(r'\W\WStreamInfo', raw)[0]
        end_marker = re.findall(r'\W\W\W\WsocialInfo', raw)[0]
        _, _, tail = raw.partition(start_marker)
        payload, _, _ = tail.partition(end_marker)
        # fix: parse the JSON once instead of twice.
        info = json.loads(start_marker + payload)
        link = info["StreamInfo"][-1]["OutputStream"]
        title = info["Uuid"]
        return link, title

    def video_download(self, link: str = None, title: str = None):
        """Download the video, resolving *link*/*title* first when absent."""
        if not link and not title:
            # Playwright is only needed for link resolution, so only start
            # it on that path.
            with sync_playwright() as playwright:
                link, title = self.get_video_link(playwright)

        base_link = self.params["link"]
        self.params["link"] = link
        # NOTE(review): the directory is "ZenYandex", not BASE_DIR's "Dzen" —
        # BaseParser appears to key resolution handling off "ZenYandex" in
        # outtmpl, so this mismatch looks intentional; confirm before unifying.
        self.params["outtmpl"] = f"downloads/ZenYandex/{title}_%(resolution)s.%(ext)s"
        try:
            file_path = super().video_download()
        finally:
            # fix: restore the original link even if the download raises, so
            # the parser instance is not left in a corrupted state.
            self.params["link"] = base_link
        return file_path
| @@ -2,6 +2,8 @@ import os | ||||
| import requests | ||||
|  | ||||
| from bs4 import BeautifulSoup | ||||
| from playwright.sync_api import Playwright | ||||
| from playwright.sync_api import sync_playwright | ||||
|  | ||||
| from src.exceptions.download_exceptions import FileAlreadyExistException | ||||
| from src.parsers.base_parser import BaseParser | ||||
| @@ -9,19 +11,31 @@ from src.parsers.base_parser import BaseParser | ||||
|  | ||||
| class YappyParser(BaseParser): | ||||
|     BASE_DIR = os.path.abspath(f"downloads/Yappy") | ||||
|     # OLD WAY | ||||
|     # def get_video_link(self): | ||||
|     #     resp = requests.get(self.params["link"]) | ||||
|     #     resp.encoding = self.BASE_ENCODING | ||||
|     #     soup = BeautifulSoup(resp.text, 'lxml') | ||||
|     # | ||||
|     #     link = soup.find('video').get("src") | ||||
|     #     title = soup.find('video').get("id") | ||||
|     #     return link, title | ||||
|  | ||||
|     def get_video_link(self): | ||||
|         resp = requests.get(self.params["link"]) | ||||
|         resp.encoding = self.BASE_ENCODING | ||||
|         soup = BeautifulSoup(resp.text, 'lxml') | ||||
|  | ||||
|         link = soup.find('video').get("src") | ||||
|         title = soup.find('video').get("id") | ||||
|     def get_video_link(self, playwright: Playwright): | ||||
|         browser = playwright.chromium.launch(headless=True) | ||||
|         context = browser.new_context() | ||||
|         page = context.new_page() | ||||
|         page.goto(url=self.params["link"], wait_until='domcontentloaded') | ||||
|         link = page.get_attribute("xpath=//video", "src") | ||||
|         title = page.get_attribute("xpath=//video", "id") | ||||
|         return link, title | ||||
|  | ||||
|  | ||||
|     def video_download(self, link: str = None, title: str = None): | ||||
|         with sync_playwright() as playwright: | ||||
|  | ||||
|             if not link and not title: | ||||
|             link, title = self.get_video_link() | ||||
|                 link, title = self.get_video_link(playwright) | ||||
|  | ||||
|         if os.path.exists(os.path.join(os.getcwd() + f"/downloads/Yappy/{title}.mp4")): | ||||
|             raise FileAlreadyExistException(message=f"Yappy/{title}.mp4") | ||||
|   | ||||
| @@ -31,6 +31,8 @@ class BaseParser: | ||||
|             resolution = "NA" | ||||
|         if "Yahoo" in ydl_opts["outtmpl"]["default"]: | ||||
|             path_to_video = f"Yahoo/{downloader.info['id']}_{resolution}.{downloader.info['ext']}" | ||||
|         elif "ZenYandex" in ydl_opts["outtmpl"]["default"]: | ||||
|             path_to_video = f"ZenYandex/{downloader.info['id']}_{resolution}.{downloader.info['ext']}" | ||||
|         else: | ||||
|             path_to_video = f"{downloader.info['extractor_key']}/{downloader.info['id']}_{resolution}.{downloader.info['ext']}" | ||||
|         if os.path.exists(os.path.join(os.getcwd() + "/downloads/" + path_to_video)): | ||||
|   | ||||
| @@ -1,9 +1,11 @@ | ||||
| from collections import OrderedDict | ||||
| import re | ||||
|  | ||||
| from src.parsers.Dzen.dzen_parser import DzenParser | ||||
| from src.parsers.MyMail.my_mail_parser import MyMailParser | ||||
| from src.parsers.Okru.ok_parser import OkParser | ||||
| from src.parsers.Yahoo.yahoo_parser import YahooParser | ||||
| from src.parsers.Yappy.yappy_parser import YappyParser | ||||
| from src.parsers.base_parser import BaseParser | ||||
|  | ||||
|  | ||||
| @@ -19,11 +21,12 @@ parser_mapping = OrderedDict( | ||||
|         compile_regex(r"^ok.ru/okvideo/topic"): OkParser, | ||||
|         compile_regex(r"^ok.ru/video"): BaseParser, | ||||
|         compile_regex(r"^...?likee.video/"): BaseParser, | ||||
|         compile_regex(r"^dzen.ru/"): BaseParser, | ||||
|         compile_regex(r"^yappy.media/"): BaseParser, | ||||
|         compile_regex(r"^dzen.ru/"): DzenParser, | ||||
|         compile_regex(r"^yappy.media/"): YappyParser, | ||||
|         compile_regex(r"^yandex.ru/"): BaseParser, | ||||
|         compile_regex(r"^.*\.yahoo.com/"): YahooParser, | ||||
|         compile_regex(r"^.*\.livejournal.com/"): BaseParser, | ||||
|         compile_regex(r"^.*\.dzen.ru/"): BaseParser, | ||||
|     } | ||||
| ) | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 nikili0n
					nikili0n