added parsers for new social networks, reworked master service
poetry.lock: 970 lines changed (generated file; diff suppressed because it is too large)
pyproject.toml
@@ -16,6 +16,12 @@ pika = "^1.3.2"
aio-pika = "^9.2.2"
setproctitle = "^1.3.2"
redis = "^5.0.0"
boto3 = "^1.28.36"
yt-dlp = "^2023.7.6"
pytest-playwright = "^0.4.2"
beautifulsoup4 = "^4.12.2"
lxml = "^4.9.3"
minio = "^7.1.16"

[build-system]

src/core/link_parser.py (new file, 53 lines)
@@ -0,0 +1,53 @@
import asyncio
import json
import re

from playwright.async_api import async_playwright
from playwright.async_api import Playwright
from aio_pika import Message, connect, DeliveryMode


async def run(playwright: Playwright):
    browser = await playwright.chromium.launch(headless=False)
    context = await browser.new_context()
    page = await context.new_page()
    await page.goto(url="https://m.my.mail.ru/v/topclips/video/alltop/68100.html")
    # await page.goto(url="https://www.youtube.com/shorts/vJU0Sr3WvmU")
    video = await page.get_attribute("xpath=//video", "src")
    connection = await connect("amqp://guest:guest@localhost/")
    title = await page.title()
    async with connection:
        for i in range(10):
            url = page.url
            body = {
                "link": url,
                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                "merge_output_format": "mp4",
                "outtmpl": f"downloads/%(extractor_key)s/%(id)s_%(width)sp.%(ext)s",
            }

            # Creating a channel
            channel = await connection.channel()

            # Sending the message
            message = Message(
                json.dumps(body, indent=4).encode('utf-8'), delivery_mode=DeliveryMode.PERSISTENT,
            )
            await channel.default_exchange.publish(
                message,
                routing_key='hello',
            )

            print(f" [x] Sent '{body}'")
            await page.keyboard.press("ArrowDown")

            while title == await page.title():
                await page.title()


async def main():
    async with async_playwright() as playwright:
        await run(playwright)


asyncio.run(main())
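
Note: link_parser.py only publishes download jobs; the consuming side lives in src.core.rabbitmq (get_messages), which this diff does not show. As a rough sketch of what a compatible consumer could look like, assuming the same 'hello' queue, the same broker credentials, and the JSON body built above (the queue settings and handler are illustrative only):

import asyncio
import json

from aio_pika import connect


async def consume():
    connection = await connect("amqp://guest:guest@localhost/")
    async with connection:
        channel = await connection.channel()
        # Assumed queue settings; the producer publishes to the default exchange
        # with routing_key='hello', so the queue name must match.
        queue = await channel.declare_queue("hello", durable=False)
        async with queue.iterator() as messages:
            async for message in messages:
                async with message.process():        # ack after successful handling
                    body = json.loads(message.body)  # {"link": ..., "format": ..., ...}
                    print("received download job for", body["link"])


asyncio.run(consume())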
@@ -1,6 +1,5 @@
import asyncio
import concurrent.futures as pool
import os.path
import subprocess

from functools import partial
@@ -11,8 +10,10 @@ from src.core.async_queue import AsyncQueue
from src.core.rabbitmq import get_messages
from src.core.redis_client import RedisClient
from src.core.result import Result, ResultTypeEnum
from src.core.ydl import VideoDownloader
from src.exceptions.download_exceptions import SiteNotImplementedException
from src.parsers.MyMail.my_mail_parser import MyMailParser
from src.parsers.Yappy.yappy_parser import YappyParser
from src.parsers.base_parser import BaseParser


class MasterService:
@@ -53,21 +54,9 @@ class MasterService:

    @staticmethod
    def video_download(video_params: dict):
        ydl_opts = {
            "format": video_params["format"],
            "merge_output_format": video_params["merge_output_format"],
            'outtmpl': video_params["outtmpl"],
            "quiet": True
        }
        downloader = VideoDownloader(link=video_params["link"], ydl_opts=ydl_opts)
        video_info = downloader.get_info()
        if os.path.exists(
                os.path.join(os.getcwd() + f"Youtube/{video_info['id']}_{video_info['width']}.{video_info['ext']}")
        ):
            return Result(result_type=ResultTypeEnum.EXIST)
        downloader: BaseParser | YappyParser | MyMailParser = MasterService.get_parser(video_params)
        try:
            downloader.ydl_opts["quiet"] = False
            result = downloader.download()
            result = downloader.video_download()
            return result
        except SiteNotImplementedException as ex:
            raise HTTPException(
@@ -75,6 +64,15 @@ class MasterService:
                detail=ex.message
            )

    @staticmethod
    def get_parser(params: dict):
        parser_mapping = {
            "MyMailRu": MyMailParser(params),
            "base": BaseParser(params),
            "Yappy": YappyParser(params),
        }
        return parser_mapping[params["parser"]]

    @staticmethod
    def video_processing_executor(video_params: dict):
        try:
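
For orientation, a sketch of how the reworked dispatch is meant to be used: the "parser" key in the incoming params selects the parser, and the rest of the dict is passed through to it. The example link and params are illustrative; note that the body published by src/core/link_parser.py above does not carry a "parser" key, so whatever consumes those messages presumably has to add or default it.

# assuming MasterService from the module above is imported
params = {
    "parser": "Yappy",                              # selects YappyParser in get_parser()
    "link": "https://yappy.media/v/some-video-id",  # hypothetical link
    "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
    "merge_output_format": "mp4",
    "outtmpl": "downloads/%(extractor_key)s/%(id)s_%(width)sp.%(ext)s",
}

downloader = MasterService.get_parser(params)       # -> YappyParser(params)
result = downloader.video_download()

One design observation: parser_mapping constructs all three parser instances on every call; a mapping of classes, instantiated only after the lookup, would avoid building the two unused ones.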

src/core/uploader.py (new file, 35 lines)
@@ -0,0 +1,35 @@
from minio import Minio
from minio.error import S3Error


def main():
    client = Minio(
        "s3.grfc.ru:443",
        access_key="cl-i-oculus-dev1",
        secret_key="Nom8qKEU6IYtQSrNt5ZPN1XncQTZdtUM",
        secure=True
    )

    # Create the target bucket if it does not already exist.
    found = client.bucket_exists("clean-internet-oculus-integration-dev")
    if not found:
        client.make_bucket("clean-internet-oculus-integration-dev")
    else:
        print("Bucket 'clean-internet-oculus-integration-dev' already exists")

    # Upload the locally downloaded video as an object in that bucket.
    client.fput_object(
        "clean-internet-oculus-integration-dev", "4uv2GNc_ybc_1080p.mp4", "/Users/garickbadalov/PycharmProjects/video_downloader_service/downloads/Youtube/4uv2GNc_ybc_1080p.mp4",
    )
    print(
        "'4uv2GNc_ybc_1080p.mp4' is successfully uploaded as "
        "object '4uv2GNc_ybc_1080p.mp4' to bucket 'clean-internet-oculus-integration-dev'."
    )


if __name__ == "__main__":
    try:
        main()
    except S3Error as exc:
        print("error occurred.", exc)
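
The uploader hardcodes the endpoint, the credentials and a single local file. If the intent is to call it from the download pipeline, a parameterized variant along these lines could work (the environment variable names and the function signature are assumptions, not part of this commit):

import os

from minio import Minio


def upload_file(bucket: str, object_name: str, file_path: str) -> None:
    # Credentials are read from the environment instead of being committed to the repo.
    client = Minio(
        os.environ["MINIO_ENDPOINT"],
        access_key=os.environ["MINIO_ACCESS_KEY"],
        secret_key=os.environ["MINIO_SECRET_KEY"],
        secure=True,
    )
    if not client.bucket_exists(bucket):
        client.make_bucket(bucket)
    client.fput_object(bucket, object_name, file_path)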
@@ -1,18 +1,14 @@
from __future__ import unicode_literals

import os
import uuid
from datetime import datetime
from urllib.parse import urlparse

import youtube_dl

from src.exceptions.download_exceptions import SiteNotImplementedException
from yt_dlp import YoutubeDL


class VideoDownloader:
    SUPPORTING_WEBSITES = [
        "ok.ru", "vk.com", "www.youtube.com",
        "ok.ru", "vk.com", "www.youtube.com", "livejournal.com"
    ]
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    BASE_DOWNLOAD_DIR = os.path.join(BASE_DIR, "downloads")
@@ -25,15 +21,15 @@ class VideoDownloader:
        self.password = password

    def get_info(self):
        with youtube_dl.YoutubeDL(self.ydl_opts if self.ydl_opts else {}) as ydl:
        with YoutubeDL(self.ydl_opts if self.ydl_opts else {}) as ydl:
            return ydl.extract_info(self.link, download=False)

    def download(self):
        domain = urlparse(self.link).netloc
        if domain not in self.SUPPORTING_WEBSITES:
            raise SiteNotImplementedException
        # if domain not in self.SUPPORTING_WEBSITES:
        #     raise SiteNotImplementedException

        with youtube_dl.YoutubeDL(self.ydl_opts if self.ydl_opts else {}) as ydl:
        with YoutubeDL(self.ydl_opts if self.ydl_opts else {}) as ydl:
            ydl.download([self.link])
            result = ydl.extract_info(self.link, download=False)
            return result
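
A short usage sketch of the reworked VideoDownloader, with the same yt-dlp options that BaseParser.video_download builds below (the YouTube link is a placeholder):

from src.core.ydl import VideoDownloader

opts = {
    "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
    "merge_output_format": "mp4",
    "outtmpl": "downloads/%(extractor_key)s/%(id)s_%(width)sp.%(ext)s",
    "quiet": True,
}
downloader = VideoDownloader(link="https://www.youtube.com/watch?v=placeholder", ydl_opts=opts)
info = downloader.get_info()      # metadata only, nothing is downloaded
print(info["id"], info.get("width"), info["ext"])
result = downloader.download()    # downloads the file, then returns extract_info(...)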

src/parsers/MyMail/my_mail_parser.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import os

import requests
from http.cookies import SimpleCookie

from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright

from src.core.result import Result, ResultTypeEnum
from src.parsers.base_parser import BaseParser


class MyMailParser(BaseParser):
    BASE_DIR = os.path.abspath(f"downloads/MyMailRu")

    def get_video_link(self, playwright: Playwright):
        browser = playwright.chromium.launch(headless=True)
        context = browser.new_context()
        page = context.new_page()
        mobile_url = f"{self.params['link'][0:8]}m.{self.params['link'][8:]}"
        page.goto(url=mobile_url)
        link = page.get_attribute("xpath=//video", "src")
        link = "https:" + link
        title = page.locator("xpath=//div[@class='event-text__title']").text_content()
        return link, title

    def video_download(self, link: str = None, title: str = None):
        if not link and not title:
            with sync_playwright() as playwright:
                link, title = self.get_video_link(playwright)

        if os.path.exists(os.path.join(os.getcwd() + f"MyMailRu/{title}.mp4")):
            return Result(result_type=ResultTypeEnum.EXIST)

        rawdata = "searchuid=4515257701686610918; p=ki8AAAYkJdcA; act=064d11655c924c9f8f2aad0181a06a4b; o=:1763:AUAQ.m; oid=22SgCdFE5g2ZEFHy1FkYW; mrcu=5A5B64F228CC3AC265485FC5AE55; re_theme=system; re_theme_actual=dark; s=fver=0|rt=1|dpr=2|ww=1728|wh=963; ph_tp_horo-mail-ru=t=1|d=1693591768453; tmr_lvid=26ef811c203f1c0c0e5d1c8af1a4671b; tmr_lvidTS=1693591768481; _ym_uid=1693591769619458564; _ym_d=1693591769; ph_v_my-mail-ru=1; mrhc=CB75tAx8UrwCaiqE85YXWoCM2+CTT6/VsTcMdxv4iCM=; mr_my_b=1; _ga=GA1.2.2000378679.1694259228; mtrc=%7B%22mytrackerid%22%3A52867%2C%22tmr_lvid%22%3A%2226ef811c203f1c0c0e5d1c8af1a4671b%22%7D; c=FuoAZQEAsHsTAAAUAQgACQAAgLrElILY4CDYNvMTASDQrSUa; b=nUwAAJBoPmMDosXR5CCG5oQO4ltqxSBq54QOYpLl6yBiWW0VIvzl6zAOfNwNOtuHt6ADAAAIpgR06GAGp1YMpgB06AAA; i=AQAQDgNlCQATAAguDyABAYwDApADARgHAewHATwJAUMLARkUAXseAjAgAfUgAfYgAfcgAfggAfEiAZMCCHYnbgABAQIBAgIBBwIBCAIBCQIBDgIBDwIBEQIBEgIBFwIBGAIBUQUBVgUBaAUBdAUBdQUBoAUBoQUBpAUBpgUBqQUBegYBDgsBKQsBLgsBxQsBxwsByQsBzAsBzQsBcA0BdQ0BeA0BvQ0B6BAB6RAB6hABw2MB3AQIBAEBAAHhBAkBAeIECgQGB80HOgUIDQQqAgEACAELCAEeEAHWBggEAQEAAb0HCAQBoxUBiQ0FAgHz; video_key=192bed9054db7a4efa7943ad834c7a2e05a55237; VID=0eXAI6071-IK00000t1kP4oK:::0-0-a1b105a-9aaf457:CAASEL33YAsZEz357mCA71F8QJgacM9HfhwzMJ-j3X3e-iJIE0DIiLWfRhfTc3GgyUNfH8_EwadLkVinwp0LA-QyaRe9p0A_ZR0y1i9Hk8aVl8Q8ZB_Qd_hCZN_SfHmeOvHeoe6QBCvz5w2SHcI2iFuAXKJkJMvNuYwSeBLdWhCXvsK5M_M"
        cookie = SimpleCookie()
        cookie.load(rawdata)
        cookies = {k: v.value for k, v in cookie.items()}

        self.make_sure_path_exists()
        video_response = requests.get(link, cookies=cookies)
        with open(self.BASE_DIR + f"/{title}.mp4", "wb") as output:
            output.write(video_response.content)

src/parsers/Yappy/yappy_parser.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import os

import requests

from bs4 import BeautifulSoup

from src.core.result import ResultTypeEnum, Result
from src.parsers.base_parser import BaseParser


class YappyParser(BaseParser):
    BASE_DIR = os.path.abspath(f"downloads/Yappy")

    def get_video_link(self):
        resp = requests.get(self.params["link"])
        resp.encoding = self.BASE_ENCODING
        soup = BeautifulSoup(resp.text, 'lxml')

        link = soup.find('video').get("src")
        title = soup.find('title').get_text()
        return link, title

    def video_download(self, link: str = None, title: str = None):
        if not link and not title:
            link, title = self.get_video_link()

        if os.path.exists(os.path.join(os.getcwd() + f"Yappy/{title}.mp4")):
            return Result(result_type=ResultTypeEnum.EXIST)

        video_response = requests.get(link)
        self.make_sure_path_exists()
        with open(self.BASE_DIR + f"/{title}.mp4", "wb") as output:
            output.write(video_response.content)

src/parsers/base_parser.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import errno
import os

from fastapi import HTTPException

from src.core.result import ResultTypeEnum, Result
from src.core.ydl import VideoDownloader
from src.exceptions.download_exceptions import SiteNotImplementedException


class BaseParser:
    BASE_ENCODING = 'utf-8'
    BASE_DIR = None

    def __init__(self, params: dict):
        self.params = params

    def video_download(self):
        ydl_opts = {
            "format": self.params["format"],
            "merge_output_format": self.params["merge_output_format"],
            'outtmpl': self.params["outtmpl"],
            "quiet": True
        }
        downloader = VideoDownloader(link=self.params["link"], ydl_opts=ydl_opts)
        video_info = downloader.get_info()
        if os.path.exists(
                os.path.join(os.getcwd() + f"Youtube/{video_info['id']}_{video_info['width']}.{video_info['ext']}")
        ):
            return Result(result_type=ResultTypeEnum.EXIST)
        try:
            downloader.ydl_opts["quiet"] = False
            result = downloader.download()
            return result
        except SiteNotImplementedException as ex:
            raise HTTPException(
                status_code=400,
                detail=ex.message
            )

    def make_sure_path_exists(self,):
        try:
            os.makedirs(self.BASE_DIR)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise
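
Taken together, supporting another social network comes down to subclassing BaseParser and registering the new class in MasterService.get_parser, following the same pattern as YappyParser. A sketch (RutubeParser and the "Rutube" key are hypothetical, not part of this commit):

import os

import requests

from src.parsers.base_parser import BaseParser


class RutubeParser(BaseParser):               # hypothetical example parser
    BASE_DIR = os.path.abspath("downloads/Rutube")

    def get_video_link(self):
        ...                                   # site-specific scraping goes here

    def video_download(self, link: str = None, title: str = None):
        if not link and not title:
            link, title = self.get_video_link()
        self.make_sure_path_exists()
        video_response = requests.get(link)
        with open(self.BASE_DIR + f"/{title}.mp4", "wb") as output:
            output.write(video_response.content)


# and in MasterService.get_parser:
#     parser_mapping = {..., "Rutube": RutubeParser(params)}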