Rework yappy_parser.py, add dzen_parser.py, minor fixes
This commit is contained in:
parent
a3516f826a
commit
87e5fe7c9a
@ -30,21 +30,6 @@ class RedisClient:
|
|||||||
res = await connection.hgetall(queue_name)
|
res = await connection.hgetall(queue_name)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
# async def get_tasks(self) -> set:
|
|
||||||
# async with self.connection as connection:
|
|
||||||
# res = await connection.smembers(self.TASKS_NAME + f":1")
|
|
||||||
# return res
|
|
||||||
|
|
||||||
# async def get_task_done_queue(self) -> set:
|
|
||||||
# async with self.connection as connection:
|
|
||||||
# res = await connection.smembers(self.TASKS_DONE_NAME + f":1")
|
|
||||||
# return res
|
|
||||||
|
|
||||||
# async def del_task_from_queue(self, link, task: dict) -> int:
|
|
||||||
# async with self.connection as connection:
|
|
||||||
# res = await self._del_task(self.SET_NAME, json.dumps(task, indent=4).encode('utf-8'))
|
|
||||||
# return res
|
|
||||||
|
|
||||||
async def del_task_from_queue_and_add_to_tasks(self, link: str, task: dict | list) -> int:
|
async def del_task_from_queue_and_add_to_tasks(self, link: str, task: dict | list) -> int:
|
||||||
await self._del_task(self.SET_NAME, link)
|
await self._del_task(self.SET_NAME, link)
|
||||||
return await self._set_task(self.TASKS_NAME, link, task)
|
return await self._set_task(self.TASKS_NAME, link, task)
|
||||||
|
@ -4,21 +4,18 @@ from minio.error import S3Error
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
client = Minio(
|
client = Minio(
|
||||||
"s3.grfc.ru:443",
|
"grfc.ru",
|
||||||
access_key="cl-i-oculus-dev1",
|
access_key="cl-i-oculus-dev1",
|
||||||
secret_key="Nom8qKEU6IYtQSrNt5ZPN1XncQTZdtUM",
|
secret_key="Nom8qKEU6IYtQSrNt5ZPN1XncQTZdtUM",
|
||||||
secure=True
|
secure=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Make 'asiatrip' bucket if not exist.
|
|
||||||
found = client.bucket_exists("clean-internet-oculus-integration-dev")
|
found = client.bucket_exists("clean-internet-oculus-integration-dev")
|
||||||
if not found:
|
if not found:
|
||||||
client.make_bucket("clean-internet-oculus-integration-dev")
|
client.make_bucket("clean-internet-oculus-integration-dev")
|
||||||
else:
|
else:
|
||||||
print("Bucket 'clean-internet-oculus-integration-dev' already exists")
|
print("Bucket 'clean-internet-oculus-integration-dev' already exists")
|
||||||
|
|
||||||
# Upload '/home/user/Photos/asiaphotos.zip' as object name
|
|
||||||
# 'asiaphotos-2015.zip' to bucket 'asiatrip'.
|
|
||||||
client.fput_object(
|
client.fput_object(
|
||||||
"clean-internet-oculus-integration-dev", "4uv2GNc_ybc_1080p.mp4", "/Users/garickbadalov/PycharmProjects/video_downloader_service/downloads/Youtube/4uv2GNc_ybc_1080p.mp4",
|
"clean-internet-oculus-integration-dev", "4uv2GNc_ybc_1080p.mp4", "/Users/garickbadalov/PycharmProjects/video_downloader_service/downloads/Youtube/4uv2GNc_ybc_1080p.mp4",
|
||||||
)
|
)
|
||||||
|
39
src/parsers/Dzen/dzen_parser.py
Normal file
39
src/parsers/Dzen/dzen_parser.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
from playwright.sync_api import Playwright
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
from src.parsers.base_parser import BaseParser
|
||||||
|
|
||||||
|
|
||||||
|
class DzenParser(BaseParser):
    """Parser for dzen.ru video pages.

    Reads the page's embedded ``serverState`` script with a headless
    Chromium browser to find the stream URL and video UUID, then hands the
    actual download over to ``BaseParser.video_download``.
    """

    BASE_DIR = os.path.abspath("downloads/Dzen")

    @staticmethod
    def _extract_video_info(script_text: str) -> dict:
        """Cut the JSON object beginning at ``StreamInfo`` out of the script text and parse it.

        The object is delimited by the first ``\\W\\WStreamInfo`` match and
        the first ``\\W\\W\\W\\WsocialInfo`` match found in ``script_text``.

        Raises:
            IndexError: if either marker is missing (same exception type the
                previous ``re.findall(...)[0]`` lookup produced).
            json.JSONDecodeError: if the sliced text is not valid JSON.
        """
        start_match = re.search(r'\W\WStreamInfo', script_text)
        end_match = re.search(r'\W\W\W\WsocialInfo', script_text)
        if start_match is None or end_match is None:
            raise IndexError("StreamInfo/socialInfo markers not found in serverState script")

        start_marker = start_match.group(0)
        end_marker = end_match.group(0)
        _, _, tail = script_text.partition(start_marker)
        payload, _, _ = tail.partition(end_marker)
        # The start marker carries the opening characters of the JSON object,
        # so it has to be put back in front of the sliced payload.
        return json.loads(start_marker + payload)

    def get_video_link(self, playwright: Playwright) -> tuple:
        """Resolve the stream URL and UUID for ``self.params["link"]``.

        Args:
            playwright: a running sync Playwright instance used to launch
                headless Chromium.

        Returns:
            ``(link, title)`` where ``link`` is the ``OutputStream`` of the
            last ``StreamInfo`` entry and ``title`` is the ``Uuid`` value.
        """
        browser = playwright.chromium.launch(headless=True)
        try:
            page = browser.new_context().new_page()
            page.goto(url=self.params["link"], wait_until='domcontentloaded')
            script_text = page.text_content("xpath=//script[contains(text(), 'serverState')]")
        finally:
            # Always release the browser process, even if navigation or the
            # selector lookup fails.
            browser.close()

        # Parse once and read both fields from the same object.
        video_info = self._extract_video_info(script_text)
        return video_info["StreamInfo"][-1]["OutputStream"], video_info["Uuid"]

    def video_download(self, link: str = None, title: str = None):
        """Download the video, resolving ``link``/``title`` from the page when neither is given.

        ``self.params["link"]`` is temporarily pointed at the direct stream
        URL for the base-class download and restored afterwards, including
        when the download raises.

        Returns:
            Whatever ``BaseParser.video_download`` returns.
        """
        # NOTE(review): lookup only happens when BOTH arguments are missing;
        # passing just one leaves the other as None — confirm this is intended.
        if not link and not title:
            # Playwright is only needed for the lookup, so it is started
            # lazily and shut down before the download begins.
            with sync_playwright() as playwright:
                link, title = self.get_video_link(playwright)

        base_link = self.params["link"]
        self.params["link"] = link
        self.params["outtmpl"] = f"downloads/ZenYandex/{title}_%(resolution)s.%(ext)s"
        try:
            return super().video_download()
        finally:
            self.params["link"] = base_link
|
@ -2,6 +2,8 @@ import os
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from playwright.sync_api import Playwright
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
from src.exceptions.download_exceptions import FileAlreadyExistException
|
from src.exceptions.download_exceptions import FileAlreadyExistException
|
||||||
from src.parsers.base_parser import BaseParser
|
from src.parsers.base_parser import BaseParser
|
||||||
@ -9,19 +11,31 @@ from src.parsers.base_parser import BaseParser
|
|||||||
|
|
||||||
class YappyParser(BaseParser):
|
class YappyParser(BaseParser):
|
||||||
BASE_DIR = os.path.abspath(f"downloads/Yappy")
|
BASE_DIR = os.path.abspath(f"downloads/Yappy")
|
||||||
|
# OLD WAY
|
||||||
|
# def get_video_link(self):
|
||||||
|
# resp = requests.get(self.params["link"])
|
||||||
|
# resp.encoding = self.BASE_ENCODING
|
||||||
|
# soup = BeautifulSoup(resp.text, 'lxml')
|
||||||
|
#
|
||||||
|
# link = soup.find('video').get("src")
|
||||||
|
# title = soup.find('video').get("id")
|
||||||
|
# return link, title
|
||||||
|
|
||||||
def get_video_link(self, playwright: Playwright):
    """Read the ``src`` and ``id`` of the page's ``<video>`` element.

    Opens ``self.params["link"]`` in headless Chromium and waits for
    ``domcontentloaded`` before querying the element.

    Args:
        playwright: a running sync Playwright instance used to launch the
            browser.

    Returns:
        ``(link, title)``: the ``src`` and ``id`` attribute values of the
        first element matching ``//video``.
    """
    browser = playwright.chromium.launch(headless=True)
    try:
        page = browser.new_context().new_page()
        page.goto(url=self.params["link"], wait_until='domcontentloaded')
        link = page.get_attribute("xpath=//video", "src")
        title = page.get_attribute("xpath=//video", "id")
    finally:
        # Release the browser process even when navigation or the attribute
        # lookup fails; previously it was never closed.
        browser.close()
    return link, title
|
||||||
|
|
||||||
|
|
||||||
def video_download(self, link: str = None, title: str = None):
|
def video_download(self, link: str = None, title: str = None):
|
||||||
|
with sync_playwright() as playwright:
|
||||||
|
|
||||||
if not link and not title:
|
if not link and not title:
|
||||||
link, title = self.get_video_link()
|
link, title = self.get_video_link(playwright)
|
||||||
|
|
||||||
if os.path.exists(os.path.join(os.getcwd() + f"/downloads/Yappy/{title}.mp4")):
|
if os.path.exists(os.path.join(os.getcwd() + f"/downloads/Yappy/{title}.mp4")):
|
||||||
raise FileAlreadyExistException(message=f"Yappy/{title}.mp4")
|
raise FileAlreadyExistException(message=f"Yappy/{title}.mp4")
|
||||||
|
@ -31,6 +31,8 @@ class BaseParser:
|
|||||||
resolution = "NA"
|
resolution = "NA"
|
||||||
if "Yahoo" in ydl_opts["outtmpl"]["default"]:
|
if "Yahoo" in ydl_opts["outtmpl"]["default"]:
|
||||||
path_to_video = f"Yahoo/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
|
path_to_video = f"Yahoo/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
|
||||||
|
elif "ZenYandex" in ydl_opts["outtmpl"]["default"]:
|
||||||
|
path_to_video = f"ZenYandex/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
|
||||||
else:
|
else:
|
||||||
path_to_video = f"{downloader.info['extractor_key']}/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
|
path_to_video = f"{downloader.info['extractor_key']}/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
|
||||||
if os.path.exists(os.path.join(os.getcwd() + "/downloads/" + path_to_video)):
|
if os.path.exists(os.path.join(os.getcwd() + "/downloads/" + path_to_video)):
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from src.parsers.Dzen.dzen_parser import DzenParser
|
||||||
from src.parsers.MyMail.my_mail_parser import MyMailParser
|
from src.parsers.MyMail.my_mail_parser import MyMailParser
|
||||||
from src.parsers.Okru.ok_parser import OkParser
|
from src.parsers.Okru.ok_parser import OkParser
|
||||||
from src.parsers.Yahoo.yahoo_parser import YahooParser
|
from src.parsers.Yahoo.yahoo_parser import YahooParser
|
||||||
|
from src.parsers.Yappy.yappy_parser import YappyParser
|
||||||
from src.parsers.base_parser import BaseParser
|
from src.parsers.base_parser import BaseParser
|
||||||
|
|
||||||
|
|
||||||
@ -19,11 +21,12 @@ parser_mapping = OrderedDict(
|
|||||||
compile_regex(r"^ok.ru/okvideo/topic"): OkParser,
|
compile_regex(r"^ok.ru/okvideo/topic"): OkParser,
|
||||||
compile_regex(r"^ok.ru/video"): BaseParser,
|
compile_regex(r"^ok.ru/video"): BaseParser,
|
||||||
compile_regex(r"^...?likee.video/"): BaseParser,
|
compile_regex(r"^...?likee.video/"): BaseParser,
|
||||||
compile_regex(r"^dzen.ru/"): BaseParser,
|
compile_regex(r"^dzen.ru/"): DzenParser,
|
||||||
compile_regex(r"^yappy.media/"): BaseParser,
|
compile_regex(r"^yappy.media/"): YappyParser,
|
||||||
compile_regex(r"^yandex.ru/"): BaseParser,
|
compile_regex(r"^yandex.ru/"): BaseParser,
|
||||||
compile_regex(r"^.*\.yahoo.com/"): YahooParser,
|
compile_regex(r"^.*\.yahoo.com/"): YahooParser,
|
||||||
compile_regex(r"^.*\.livejournal.com/"): BaseParser,
|
compile_regex(r"^.*\.livejournal.com/"): BaseParser,
|
||||||
|
compile_regex(r"^.*\.dzen.ru/"): BaseParser,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user