Rework yappy_parser.py, add dzen_parser.py, minor fixes

This commit is contained in:
2023-10-10 03:41:55 +03:00
parent 002a7efb9c
commit b4bfde5bd2
6 changed files with 70 additions and 30 deletions

View File

@ -0,0 +1,39 @@
import json
import os
import re
from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from src.parsers.base_parser import BaseParser
class DzenParser(BaseParser):
    """Parser that resolves a Dzen page into a direct stream URL and downloads it.

    The page is rendered with Playwright, the inline ``serverState`` script is
    located, and the JSON fragment that starts at ``StreamInfo`` and ends just
    before ``socialInfo`` is cut out of it and decoded.
    """

    BASE_DIR = os.path.abspath("downloads/Dzen")

    def get_video_link(self, playwright: Playwright):
        """Open the page from ``self.params["link"]`` and extract the stream data.

        Args:
            playwright: An active Playwright instance used to launch Chromium.

        Returns:
            A ``(link, title)`` tuple: ``link`` is the ``OutputStream`` value of
            the last ``StreamInfo`` entry, ``title`` is the ``Uuid`` value.

        Raises:
            IndexError: If the ``StreamInfo`` / ``socialInfo`` markers are not
                present in the script text.
        """
        browser = playwright.chromium.launch(headless=True)
        try:
            context = browser.new_context()
            page = context.new_page()
            page.goto(url=self.params["link"], wait_until='domcontentloaded')
            script_text = page.text_content("xpath=//script[contains(text(), 'serverState')]")
        finally:
            # Only the script text is needed; release the browser before the
            # (potentially long) download starts and on any navigation error.
            browser.close()

        # The markers include the punctuation preceding the key names so the
        # cut-out fragment can be re-assembled into a decodable JSON object.
        json_start = re.findall(r'\W\WStreamInfo', script_text)[0]
        json_end = re.findall(r'\W\W\W\WsocialInfo', script_text)[0]
        _, _, tail = script_text.partition(json_start)
        json_body, _, _ = tail.partition(json_end)

        # Decode once and read both fields from the same object.
        video_info = json.loads(json_start + json_body)
        link = video_info["StreamInfo"][-1]["OutputStream"]
        title = video_info["Uuid"]
        return link, title

    def video_download(self, link: str = None, title: str = None):
        """Download the video, resolving the stream link first when needed.

        Args:
            link: Direct stream URL. Resolved via ``get_video_link`` when both
                ``link`` and ``title`` are falsy.
            title: Name used in the output file template. Resolved together
                with ``link``.

        Returns:
            Whatever ``BaseParser.video_download`` returns (stored by the
            original code as ``file_path``).
        """
        if not link and not title:
            # Playwright is only started when the link actually has to be scraped.
            with sync_playwright() as playwright:
                link, title = self.get_video_link(playwright)

        base_link = self.params["link"]
        self.params["link"] = link
        self.params["outtmpl"] = f"downloads/ZenYandex/{title}_%(resolution)s.%(ext)s"
        try:
            return super().video_download()
        finally:
            # Always restore the original page URL, even if the download fails.
            self.params["link"] = base_link

View File

@ -2,6 +2,8 @@ import os
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from src.exceptions.download_exceptions import FileAlreadyExistException
from src.parsers.base_parser import BaseParser
@ -9,19 +11,31 @@ from src.parsers.base_parser import BaseParser
class YappyParser(BaseParser):
BASE_DIR = os.path.abspath(f"downloads/Yappy")
# OLD WAY
# def get_video_link(self):
# resp = requests.get(self.params["link"])
# resp.encoding = self.BASE_ENCODING
# soup = BeautifulSoup(resp.text, 'lxml')
#
# link = soup.find('video').get("src")
# title = soup.find('video').get("id")
# return link, title
def get_video_link(self):
resp = requests.get(self.params["link"])
resp.encoding = self.BASE_ENCODING
soup = BeautifulSoup(resp.text, 'lxml')
link = soup.find('video').get("src")
title = soup.find('video').get("id")
def get_video_link(self, playwright: Playwright):
    """Render the page from ``self.params["link"]`` and read the ``<video>`` tag.

    Args:
        playwright: An active Playwright instance used to launch Chromium.

    Returns:
        A ``(link, title)`` tuple taken from the ``src`` and ``id`` attributes
        of the first element matching ``//video``.
    """
    browser = playwright.chromium.launch(headless=True)
    try:
        context = browser.new_context()
        page = context.new_page()
        page.goto(url=self.params["link"], wait_until='domcontentloaded')
        link = page.get_attribute("xpath=//video", "src")
        title = page.get_attribute("xpath=//video", "id")
    finally:
        # Release the browser as soon as the attributes are read, and also
        # when navigation or the lookup fails.
        browser.close()
    return link, title
def video_download(self, link: str = None, title: str = None):
if not link and not title:
link, title = self.get_video_link()
with sync_playwright() as playwright:
if not link and not title:
link, title = self.get_video_link(playwright)
if os.path.exists(os.path.join(os.getcwd() + f"/downloads/Yappy/{title}.mp4")):
raise FileAlreadyExistException(message=f"Yappy/{title}.mp4")

View File

@ -31,6 +31,8 @@ class BaseParser:
resolution = "NA"
if "Yahoo" in ydl_opts["outtmpl"]["default"]:
path_to_video = f"Yahoo/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
elif "ZenYandex" in ydl_opts["outtmpl"]["default"]:
path_to_video = f"ZenYandex/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
else:
path_to_video = f"{downloader.info['extractor_key']}/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
if os.path.exists(os.path.join(os.getcwd() + "/downloads/" + path_to_video)):

View File

@ -1,9 +1,11 @@
from collections import OrderedDict
import re
from src.parsers.Dzen.dzen_parser import DzenParser
from src.parsers.MyMail.my_mail_parser import MyMailParser
from src.parsers.Okru.ok_parser import OkParser
from src.parsers.Yahoo.yahoo_parser import YahooParser
from src.parsers.Yappy.yappy_parser import YappyParser
from src.parsers.base_parser import BaseParser
@ -19,11 +21,12 @@ parser_mapping = OrderedDict(
compile_regex(r"^ok.ru/okvideo/topic"): OkParser,
compile_regex(r"^ok.ru/video"): BaseParser,
compile_regex(r"^...?likee.video/"): BaseParser,
compile_regex(r"^dzen.ru/"): BaseParser,
compile_regex(r"^yappy.media/"): BaseParser,
compile_regex(r"^dzen.ru/"): DzenParser,
compile_regex(r"^yappy.media/"): YappyParser,
compile_regex(r"^yandex.ru/"): BaseParser,
compile_regex(r"^.*\.yahoo.com/"): YahooParser,
compile_regex(r"^.*\.livejournal.com/"): BaseParser,
compile_regex(r"^.*\.dzen.ru/"): BaseParser,
}
)