Rework yappy_parser.py, add dzen_parser.py, minor fixes
@@ -30,21 +30,6 @@ class RedisClient:
             res = await connection.hgetall(queue_name)
         return res
 
-    # async def get_tasks(self) -> set:
-    #     async with self.connection as connection:
-    #         res = await connection.smembers(self.TASKS_NAME + f":1")
-    #     return res
-
-    # async def get_task_done_queue(self) -> set:
-    #     async with self.connection as connection:
-    #         res = await connection.smembers(self.TASKS_DONE_NAME + f":1")
-    #     return res
-
-    # async def del_task_from_queue(self, link, task: dict) -> int:
-    #     async with self.connection as connection:
-    #         res = await self._del_task(self.SET_NAME, json.dumps(task, indent=4).encode('utf-8'))
-    #     return res
-
     async def del_task_from_queue_and_add_to_tasks(self, link: str, task: dict | list) -> int:
         await self._del_task(self.SET_NAME, link)
         return await self._set_task(self.TASKS_NAME, link, task)
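For orientation, a hedged usage sketch of the surviving method; RedisClient's constructor and the example values are assumptions, not taken from this commit:

import asyncio

async def main() -> None:
    client = RedisClient()  # assumed constructor; not shown in this diff
    # Removes the link from the pending set (SET_NAME), then stores the task
    # under TASKS_NAME -- exactly the two awaits in the method above.
    await client.del_task_from_queue_and_add_to_tasks(
        link="https://yappy.media/v/some-id",
        task={"status": "in_progress"},
    )

asyncio.run(main())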
@@ -4,21 +4,18 @@ from minio.error import S3Error
 
 def main():
     client = Minio(
-        "s3.grfc.ru:443",
+        "grfc.ru",
         access_key="cl-i-oculus-dev1",
         secret_key="Nom8qKEU6IYtQSrNt5ZPN1XncQTZdtUM",
         secure=True
     )
 
-    # Make 'asiatrip' bucket if not exist.
     found = client.bucket_exists("clean-internet-oculus-integration-dev")
     if not found:
         client.make_bucket("clean-internet-oculus-integration-dev")
     else:
         print("Bucket 'clean-internet-oculus-integration-dev' already exists")
 
-    # Upload '/home/user/Photos/asiaphotos.zip' as object name
-    # 'asiaphotos-2015.zip' to bucket 'asiatrip'.
     client.fput_object(
         "clean-internet-oculus-integration-dev", "4uv2GNc_ybc_1080p.mp4", "/Users/garickbadalov/PycharmProjects/video_downloader_service/downloads/Youtube/4uv2GNc_ybc_1080p.mp4",
     )
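The same bucket-then-upload flow, sketched with the S3Error handling that the hunk's import context suggests; the wrapper function and its name are illustrative only:

from minio import Minio
from minio.error import S3Error

def upload(client: Minio, bucket: str, object_name: str, file_path: str) -> None:
    # Create the bucket on first use, then upload the local file.
    try:
        if not client.bucket_exists(bucket):
            client.make_bucket(bucket)
        client.fput_object(bucket, object_name, file_path)
    except S3Error as err:
        print(f"MinIO operation failed: {err}")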
src/parsers/Dzen/dzen_parser.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+import json
+import os
+import re
+
+from playwright.sync_api import Playwright
+from playwright.sync_api import sync_playwright
+
+from src.parsers.base_parser import BaseParser
+
+
+class DzenParser(BaseParser):
+    BASE_DIR = os.path.abspath(f"downloads/Dzen")
+
+    def get_video_link(self, playwright: Playwright):
+        browser = playwright.chromium.launch(headless=True)
+        context = browser.new_context()
+        page = context.new_page()
+        page.goto(url=self.params["link"], wait_until='domcontentloaded')
+        link = page.text_content("xpath=//script[contains(text(), 'serverState')]")
+        links_json_starts_with = re.findall(r'\W\WStreamInfo', link)[0]
+        links_json_ends_with = re.findall(r'\W\W\W\WsocialInfo', link)[0]
+        _, _, link = link.partition(links_json_starts_with)
+        links, _, _ = link.partition(links_json_ends_with)
+        link = json.loads(links_json_starts_with + links)["StreamInfo"][-1]["OutputStream"]
+        title = json.loads(links_json_starts_with + links)["Uuid"]
+        return link, title
+
+    def video_download(self, link: str = None, title: str = None):
+        with sync_playwright() as playwright:
+
+            if not link and not title:
+                link, title = self.get_video_link(playwright)
+
+        base_link = self.params["link"]
+        self.params["link"] = link
+        self.params["outtmpl"] = f"downloads/ZenYandex/{title}_%(resolution)s.%(ext)s"
+        file_path = super().video_download()
+        self.params["link"] = base_link
+        return file_path
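A hedged driver for the new parser; the params shape is inferred from the self.params["link"] reads above, and BaseParser's constructor signature is an assumption:

# Hypothetical usage; BaseParser.__init__(params=...) is assumed, not shown here.
parser = DzenParser(params={"link": "https://dzen.ru/video/watch/some-id"})
file_path = parser.video_download()  # resolves the stream URL, then delegates to BaseParser
print(file_path)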
src/parsers/Yappy/yappy_parser.py
@@ -2,6 +2,8 @@ import os
 import requests
 
 from bs4 import BeautifulSoup
+from playwright.sync_api import Playwright
+from playwright.sync_api import sync_playwright
 
 from src.exceptions.download_exceptions import FileAlreadyExistException
 from src.parsers.base_parser import BaseParser

@@ -9,19 +11,31 @@ from src.parsers.base_parser import BaseParser
 
 class YappyParser(BaseParser):
     BASE_DIR = os.path.abspath(f"downloads/Yappy")
+    # OLD WAY
+    # def get_video_link(self):
+    #     resp = requests.get(self.params["link"])
+    #     resp.encoding = self.BASE_ENCODING
+    #     soup = BeautifulSoup(resp.text, 'lxml')
+    #
+    #     link = soup.find('video').get("src")
+    #     title = soup.find('video').get("id")
+    #     return link, title
 
-    def get_video_link(self):
-        resp = requests.get(self.params["link"])
-        resp.encoding = self.BASE_ENCODING
-        soup = BeautifulSoup(resp.text, 'lxml')
-
-        link = soup.find('video').get("src")
-        title = soup.find('video').get("id")
+    def get_video_link(self, playwright: Playwright):
+        browser = playwright.chromium.launch(headless=True)
+        context = browser.new_context()
+        page = context.new_page()
+        page.goto(url=self.params["link"], wait_until='domcontentloaded')
+        link = page.get_attribute("xpath=//video", "src")
+        title = page.get_attribute("xpath=//video", "id")
         return link, title
 
 
     def video_download(self, link: str = None, title: str = None):
+        with sync_playwright() as playwright:
+
             if not link and not title:
-            link, title = self.get_video_link()
+                link, title = self.get_video_link(playwright)
 
         if os.path.exists(os.path.join(os.getcwd() + f"/downloads/Yappy/{title}.mp4")):
             raise FileAlreadyExistException(message=f"Yappy/{title}.mp4")
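The Playwright flow the rewrite switches to, as a standalone sketch (placeholder URL; not part of the commit):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_context().new_page()
    page.goto("https://yappy.media/v/some-id", wait_until="domcontentloaded")
    src = page.get_attribute("xpath=//video", "src")    # direct video URL
    title = page.get_attribute("xpath=//video", "id")   # used as the file name
    browser.close()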
src/parsers/base_parser.py
@@ -31,6 +31,8 @@ class BaseParser:
             resolution = "NA"
         if "Yahoo" in ydl_opts["outtmpl"]["default"]:
             path_to_video = f"Yahoo/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
+        elif "ZenYandex" in ydl_opts["outtmpl"]["default"]:
+            path_to_video = f"ZenYandex/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
         else:
             path_to_video = f"{downloader.info['extractor_key']}/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
         if os.path.exists(os.path.join(os.getcwd() + "/downloads/" + path_to_video)):
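The new elif extends the site-specific path dispatch; the same logic as a pure function, a sketch with stand-in names rather than the repository's actual structure:

def resolve_path(outtmpl: str, info: dict, resolution: str) -> str:
    # Site-specific folders first; generic extractor_key folder otherwise.
    if "Yahoo" in outtmpl:
        return f"Yahoo/{info['id']}_{resolution}.{info['ext']}"
    if "ZenYandex" in outtmpl:
        return f"ZenYandex/{info['id']}_{resolution}.{info['ext']}"
    return f"{info['extractor_key']}/{info['id']}_{resolution}.{info['ext']}"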
@@ -1,9 +1,11 @@
 from collections import OrderedDict
 import re
 
+from src.parsers.Dzen.dzen_parser import DzenParser
 from src.parsers.MyMail.my_mail_parser import MyMailParser
 from src.parsers.Okru.ok_parser import OkParser
 from src.parsers.Yahoo.yahoo_parser import YahooParser
+from src.parsers.Yappy.yappy_parser import YappyParser
 from src.parsers.base_parser import BaseParser

@@ -19,11 +21,12 @@ parser_mapping = OrderedDict(
         compile_regex(r"^ok.ru/okvideo/topic"): OkParser,
         compile_regex(r"^ok.ru/video"): BaseParser,
         compile_regex(r"^...?likee.video/"): BaseParser,
-        compile_regex(r"^dzen.ru/"): BaseParser,
-        compile_regex(r"^yappy.media/"): BaseParser,
+        compile_regex(r"^dzen.ru/"): DzenParser,
+        compile_regex(r"^yappy.media/"): YappyParser,
         compile_regex(r"^yandex.ru/"): BaseParser,
         compile_regex(r"^.*\.yahoo.com/"): YahooParser,
         compile_regex(r"^.*\.livejournal.com/"): BaseParser,
+        compile_regex(r"^.*\.dzen.ru/"): BaseParser,
     }
 )
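A hedged sketch of how parser_mapping is presumably consulted; the lookup helper and the URL normalization are assumptions based on the scheme-less keys above:

def get_parser(url: str):
    # Keys match scheme-less URLs, so strip the scheme and "www." first (assumed).
    normalized = url.removeprefix("https://").removeprefix("http://").removeprefix("www.")
    for pattern, parser_cls in parser_mapping.items():
        if pattern.match(normalized):
            return parser_cls
    return BaseParser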