From f1fdcf8fb4108cecbf15fadffe3845462a5c5c26 Mon Sep 17 00:00:00 2001
From: garickbadalov
Date: Tue, 21 Nov 2023 22:21:00 +0300
Subject: [PATCH] added s3_client, refactored web and master services
---
poetry.lock | 16 ++++++++-
pyproject.toml | 1 +
src/core/config.py | 13 +++++++
src/core/master_service.py | 10 ++++--
src/core/redis_client.py | 4 +++
src/core/s3_client.py | 66 +++++++++++++++++++++++++++++++++++
src/core/uploader.py | 32 -----------------
src/parsers/Okru/ok_parser.py | 15 ++++++--
src/parsers/base_parser.py | 13 ++++---
src/web/main.py | 56 ++++++++++++++++++++---------
src/web/schemes/submit.py | 15 ++++++--
src/web/templates/index.html | 12 +++----
12 files changed, 186 insertions(+), 67 deletions(-)
create mode 100644 src/core/config.py
create mode 100644 src/core/s3_client.py
delete mode 100644 src/core/uploader.py
diff --git a/poetry.lock b/poetry.lock
index bc6e8d3..6af8923 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1680,6 +1680,20 @@ files = [
[package.dependencies]
six = ">=1.5"
+[[package]]
+name = "python-dotenv"
+version = "1.0.0"
+description = "Read key-value pairs from a .env file and set them as environment variables"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"},
+ {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"},
+]
+
+[package.extras]
+cli = ["click (>=5.0)"]
+
[[package]]
name = "python-multipart"
version = "0.0.6"
@@ -2327,4 +2341,4 @@ websockets = "*"
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
-content-hash = "272fe31fba150b0b0fcca1b7d60f706dc2a05ea730ef19e34ccb8e5524f47d66"
+content-hash = "b7973dc522b312b75a798bc966c5001b24e134cccd332de7b978e5a1ec495b57"
diff --git a/pyproject.toml b/pyproject.toml
index a9365e7..64dac5c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ ply = "3.11"
ruamel-yaml = "0.17.21"
flask-login = "0.6.2"
pycryptodome = "3.18.0"
+python-dotenv = "^1.0.0"
[build-system]
diff --git a/src/core/config.py b/src/core/config.py
new file mode 100644
index 0000000..fb36a12
--- /dev/null
+++ b/src/core/config.py
@@ -0,0 +1,13 @@
+import os
+
+from dotenv import load_dotenv
+
+
+load_dotenv()
+
+
+S3_HOST = os.environ.get("S3_HOST", "s3-api.grfc.ru")
+S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "cl-i-oculus-dev1")
+S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "Nom8qKEU6IYtQSrNt5ZPN1XncQTZdtUM")
+S3_BUCKET_NAME = os.environ.get("S3_BUCKET_NAME", "clean-internet-oculus-integration-dev")
+DEFAULT_DURATION = os.environ.get("DEFAULT_DURATION", 600)
diff --git a/src/core/master_service.py b/src/core/master_service.py
index 6419ac8..ca54d3d 100644
--- a/src/core/master_service.py
+++ b/src/core/master_service.py
@@ -1,6 +1,7 @@
import asyncio
import concurrent.futures as pool
import json
+import os
import subprocess
import traceback
@@ -12,6 +13,7 @@ from src.core.async_queue import AsyncQueue
from src.core.rabbitmq import get_messages, publish_message_with_task_done
from src.core.redis_client import RedisClient
from src.core.result import Result, ResultTypeEnum
+from src.core.s3_client import S3Client
from src.exceptions.download_exceptions import FileAlreadyExistException, SiteNotImplementedException
from src.parsers.MyMail.my_mail_parser import MyMailParser
from src.parsers.Yappy.yappy_parser import YappyParser
@@ -50,7 +52,10 @@ class MasterService:
await asyncio.gather(self.rabbit_consumer(self.queue), *tasks)
- async def result_processing(self, result: Result | list, redis: RedisClient, video_params: dict):
+ async def result_processing(self, result: Result | list, redis: RedisClient, s3_client: S3Client, video_params: dict):
+ file_path = os.path.join(os.getcwd() + "/downloads/")
+ links_to_download = s3_client.upload(file_name=result.value["result"], file_path=file_path)
+ result.value["result"] = links_to_download
await redis.del_task_from_tasks_and_add_to_task_done(task=result.value, link=video_params["link"])
await publish_message_with_task_done(task=result.value)
self.queue.task_done()
@@ -59,6 +64,7 @@ class MasterService:
while True:
video_params = await self.queue.get()
redis = RedisClient()
+ s3_client = S3Client()
await redis.del_task_from_queue_and_add_to_tasks(link=video_params["link"], task=video_params)
self.currently_underway[video_params['link']] = video_params
@@ -67,7 +73,7 @@ class MasterService:
))
result: Result = await download_task
- await self.result_processing(result, redis, video_params)
+ await self.result_processing(result, redis, s3_client, video_params)
if video_params['link'] in self.currently_underway:
del self.currently_underway[video_params['link']]
diff --git a/src/core/redis_client.py b/src/core/redis_client.py
index c304de8..adb17c1 100644
--- a/src/core/redis_client.py
+++ b/src/core/redis_client.py
@@ -46,3 +46,7 @@ class RedisClient:
async with self.connection as connection:
res = await connection.delete(self.TASKS_NAME)
return res
+
+ async def update_task_in_tasks_done(self, task: dict | list, link: str) -> int:
+ await self._del_task(self.TASKS_DONE_NAME, link)
+ return await self._set_task(self.TASKS_DONE_NAME, link, task)
diff --git a/src/core/s3_client.py b/src/core/s3_client.py
new file mode 100644
index 0000000..21ffe95
--- /dev/null
+++ b/src/core/s3_client.py
@@ -0,0 +1,66 @@
+from loguru import logger
+from minio import Minio
+from minio.commonconfig import CopySource
+
+from src.core.config import S3_HOST, S3_ACCESS_KEY, S3_SECRET_KEY, S3_BUCKET_NAME
+
+
+class S3Client:
+ HOST = S3_HOST
+ ACCESS_KEY = S3_ACCESS_KEY
+ SECRET_KEY = S3_SECRET_KEY
+ BUCKET_NAME = S3_BUCKET_NAME
+
+ def __init__(self):
+ self.client = Minio(
+ self.HOST,
+ access_key=self.ACCESS_KEY,
+ secret_key=self.SECRET_KEY,
+ secure=True
+ )
+
+ def _make_sure_bucket_exist(self):
+ found = self.client.bucket_exists(self.BUCKET_NAME)
+ if not found:
+ self.client.make_bucket(self.BUCKET_NAME)
+ else:
+ logger.info(f"Bucket {self.BUCKET_NAME} already exists")
+
+ def upload(self, file_name: str, file_path: str | list[str]):
+ self._make_sure_bucket_exist()
+ if isinstance(file_name, str):
+ file_path = file_path + file_name
+ self.client.fput_object(self.BUCKET_NAME, file_name, file_path)
+ logger.info(f"{file_path} is successfully uploaded as object {file_name} to bucket {self.BUCKET_NAME}.")
+ link_to_download = self.get(file_name)
+ return link_to_download
+ else:
+ result = []
+ for file_name_part in file_name:
+ current_file_path = file_path + file_name_part
+ self.client.fput_object(self.BUCKET_NAME, file_name_part, current_file_path)
+ logger.info(f"{current_file_path} is successfully uploaded as object {file_name_part} to bucket {self.BUCKET_NAME}.")
+ link_to_download = self.get(file_name_part)
+ result.append(link_to_download)
+ return result
+
+ def get(self, file_name: str):
+ self._make_sure_bucket_exist()
+ result = self.client.get_presigned_url(
+ "GET",
+ self.BUCKET_NAME,
+ file_name,
+ )
+ return result
+
+ def delete(self, file_name: str):
+ result = self.client.remove_object(self.BUCKET_NAME, file_name)
+ return result
+
+ def copy_to_another_bucket(self, file_name: str, bucket_name):
+ result = self.client.copy_object(
+ bucket_name,
+ file_name,
+ CopySource(self.BUCKET_NAME, file_name),
+ )
+ return result
diff --git a/src/core/uploader.py b/src/core/uploader.py
deleted file mode 100644
index fe05fc6..0000000
--- a/src/core/uploader.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from minio import Minio
-from minio.error import S3Error
-
-
-def main():
- client = Minio(
- "grfc.ru",
- access_key="cl-i-oculus-dev1",
- secret_key="Nom8qKEU6IYtQSrNt5ZPN1XncQTZdtUM",
- secure=True
- )
-
- found = client.bucket_exists("clean-internet-oculus-integration-dev")
- if not found:
- client.make_bucket("clean-internet-oculus-integration-dev")
- else:
- print("Bucket 'clean-internet-oculus-integration-dev' already exists")
-
- client.fput_object(
- "clean-internet-oculus-integration-dev", "4uv2GNc_ybc_1080p.mp4", "/Users/garickbadalov/PycharmProjects/video_downloader_service/downloads/Youtube/4uv2GNc_ybc_1080p.mp4",
- )
- print(
- "'/Users/garickbadalov/PycharmProjects/video_downloader_service/downloads/Youtube/4uv2GNc_ybc_1080p.mp4' is successfully uploaded as "
- "object '4uv2GNc_ybc_1080p.mp4' to bucket 'clean-internet-oculus-integration-dev'."
- )
-
-
-if __name__ == "__main__":
- try:
- main()
- except S3Error as exc:
- print("error occurred.", exc)
\ No newline at end of file
diff --git a/src/parsers/Okru/ok_parser.py b/src/parsers/Okru/ok_parser.py
index 1697ec6..9973b0f 100644
--- a/src/parsers/Okru/ok_parser.py
+++ b/src/parsers/Okru/ok_parser.py
@@ -1,8 +1,10 @@
import os
+import re
import requests
from bs4 import BeautifulSoup
+from lxml import etree
from src.exceptions.download_exceptions import FileAlreadyExistException
from src.parsers.base_parser import BaseParser
@@ -16,9 +18,16 @@ class OkParser(BaseParser):
resp = requests.get(self.params["link"])
resp.encoding = self.BASE_ENCODING
soup = BeautifulSoup(resp.text, 'lxml')
- required_div = [div for div in soup.find_all('div', {'class': 'invisible'}) if len(div['class']) < 2][0]
- video_tags = required_div.find('span').find_all_next('span', {'itemprop': "video"})
- links = [video_tag.find('a').get("href") for video_tag in video_tags]
+ if "topic" in self.params["link"]:
+ dom = etree.HTML(str(soup))
+ elements_with_video_id = dom.xpath(
+ "//div[@class='mlr_cnt']/div[contains(@data-l, 'gA,VIDEO,mB,movie,ti,')]/div[@class='vid-card "
+ "vid-card__xl']/div[@class='video-card_n-w']/a[contains(@onclick, 'OK.VideoPlayer.openMovie')]")
+ links = ["https://ok.ru/video/" + re.findall('\d+', elem.get("onclick"))[0] for elem in elements_with_video_id]
+ else:
+ required_div = [div for div in soup.find_all('div', {'class': 'invisible'}) if len(div['class']) < 2][0]
+ video_tags = required_div.find('span').find_all_next('span', {'itemprop': "video"})
+ links = [video_tag.find('a').get("href") for video_tag in video_tags]
return links
except Exception as ex:
raise
diff --git a/src/parsers/base_parser.py b/src/parsers/base_parser.py
index 1adca12..0f9417f 100644
--- a/src/parsers/base_parser.py
+++ b/src/parsers/base_parser.py
@@ -2,7 +2,9 @@ import errno
import os
from loguru import logger
+from yt_dlp import download_range_func
+from src.core.config import DEFAULT_DURATION
from src.core.ydl import VideoDownloader
from src.exceptions.download_exceptions import FileAlreadyExistException
@@ -20,6 +22,7 @@ class BaseParser:
"logger": logger,
"merge_output_format": self.params["merge_output_format"],
'outtmpl': self.params["outtmpl"],
+ 'download_ranges': download_range_func(None, [(0, int(DEFAULT_DURATION))]),
# "quiet": True
}
downloader = VideoDownloader(link=self.params["link"], ydl_opts=ydl_opts)
@@ -29,14 +32,16 @@ class BaseParser:
resolution = downloader.info['resolution']
else:
resolution = "NA"
+
+ base_file_name = f"{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
if "Yahoo" in ydl_opts["outtmpl"]["default"]:
- path_to_video = f"Yahoo/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
+ path_to_video = f"Yahoo/{base_file_name}"
elif "ZenYandex" in ydl_opts["outtmpl"]["default"]:
- path_to_video = f"ZenYandex/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
+ path_to_video = f"ZenYandex/{base_file_name}"
elif "Bing" in ydl_opts["outtmpl"]["default"]:
- path_to_video = f"Bing/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
+ path_to_video = f"Bing/{base_file_name}"
else:
- path_to_video = f"{downloader.info['extractor_key']}/{downloader.info['id']}_{resolution}.{downloader.info['ext']}"
+ path_to_video = f"{downloader.info['extractor_key']}/{base_file_name}"
if os.path.exists(os.path.join(os.getcwd() + "/downloads/" + path_to_video)):
raise FileAlreadyExistException(message=path_to_video)
downloader.ydl_opts["quiet"] = False
diff --git a/src/web/main.py b/src/web/main.py
index abb62d5..161266c 100644
--- a/src/web/main.py
+++ b/src/web/main.py
@@ -1,16 +1,16 @@
import json
-import os
import uvicorn
import logging
from aio_pika import connect, Message, DeliveryMode
from fastapi import FastAPI, Request, Depends
from starlette.middleware.cors import CORSMiddleware
-from starlette.responses import JSONResponse, FileResponse, StreamingResponse
+from starlette.responses import JSONResponse, FileResponse, Response
from starlette.templating import Jinja2Templates
from src.core.redis_client import RedisClient
-from src.web.schemes.submit import SubmitIn, CheckIn
+from src.core.s3_client import S3Client
+from src.web.schemes.submit import SubmitIn, CheckIn, DeleteFromS3, CopyToAnotherBucketS3
app = FastAPI(
title="video_downloader", openapi_url=f"/api/v1/openapi.json"
@@ -82,6 +82,7 @@ async def index(request: Request):
@app.post('/submit')
async def get_url_for_download_video(request: Request, data: SubmitIn = Depends()):
red = RedisClient()
+ s3_client = S3Client()
task_done = await is_task_already_done_or_exist(red, data.link)
# TODO: где-то не обновился статус после выполнения\провала задачи
task_in_process = await is_task_in_process(red, data.link)
@@ -89,9 +90,15 @@ async def get_url_for_download_video(request: Request, data: SubmitIn = Depends(
return JSONResponse(status_code=202, content={"result": "Задача в работе. Ожидайте"})
if task_done:
if isinstance(task_done["result"], str):
- links_to_download_video = [str(request.base_url) + "get/?file_path=" + task_done["result"]]
+ file_name = task_done["result"][task_done["result"].index("dev/") + 4:task_done["result"].index("?")]
+ links_to_download_video = [s3_client.get(file_name)]
+ task_done["result"] = links_to_download_video[0]
else:
- links_to_download_video = [str(request.base_url) + "get/?file_path=" + path for path in task_done["result"]]
+ file_names = [task_done_part[task_done_part.index("dev/") + 4:task_done_part.index("?")] for
+ task_done_part in task_done["result"]]
+ links_to_download_video = [s3_client.get(file_name) for file_name in file_names]
+ task_done["result"] = links_to_download_video
+ await red.update_task_in_tasks_done(task_done, task_done["link"])
return JSONResponse({"result": links_to_download_video})
# TODO: учесть, что если делать запрос CURL\urllib3\etc, в теле может быть несколько ссылок -> должно быть создано несколько задач
@@ -104,6 +111,7 @@ async def get_url_for_download_video(request: Request, data: SubmitIn = Depends(
"format": f"bv[width={data.resolution.value}][ext={data.video_format.value}]+ba[ext={data.audio_format.value}]/"
f"bv[width={data.resolution.value}][ext=mp4]+ba[ext=m4a]/"
f"bv[width={data.resolution.value}][ext=webm]+ba[ext=webm]/"
+ f"best[width={data.resolution.value}]/"
f"best[ext={data.video_format.value}]/"
f"best[ext!=unknown_video]",
"merge_output_format": data.merge_output_format.value,
@@ -132,15 +140,10 @@ async def get_url_for_download_video(request: Request, data: SubmitIn = Depends(
@app.get('/get/', response_class=FileResponse, status_code=200)
async def download_video(file_path):
- base = os.path.dirname(os.path.dirname(os.path.abspath(file_path)))
- base_download_dir = os.path.join(base, os.pardir, os.pardir, "downloads")
-
- def iterfile():
- with open(base_download_dir + f'/{file_path}', mode="rb") as file_like:
- yield from file_like
-
- return StreamingResponse(iterfile(), headers={'Content-Disposition': f'inline; filename="{file_path}"'},
- media_type="video")
+ s3_client = S3Client()
+ file_response = s3_client.get(file_path)
+ return Response(content=file_response.data, headers={'Content-Disposition': f'inline; filename="{file_path}"'},
+ media_type="video")
@app.post('/check/', response_class=FileResponse, status_code=200)
@@ -168,9 +171,9 @@ async def download_video(data: CheckIn, request: Request):
content={"result": f"Задача выполнена с ошибкой, попробуйте загрузить еще раз"})
if tasks_done and data.link in tasks_done:
if isinstance(tasks_done[data.link]["result"], str):
- links_to_download_video = [str(request.base_url) + "get/?file_path=" + tasks_done[data.link]["result"]]
+ links_to_download_video = [tasks_done[data.link]["result"]]
else:
- links_to_download_video = [str(request.base_url) + "get/?file_path=" + path for path in
+ links_to_download_video = [link for link in
tasks_done[data.link]["result"]]
return JSONResponse({"result": links_to_download_video})
return JSONResponse(status_code=404, content={"result": "Задача не найдена"})
@@ -180,4 +183,25 @@ async def download_video(data: CheckIn, request: Request):
except Exception as ex:
print(ex)
+
+@app.delete('/from-s3/', status_code=200)
+async def delete_video_from_s3(delete_data: DeleteFromS3):
+ s3_client = S3Client()
+ s3_client.delete(delete_data.file_name)
+ return JSONResponse(
+ status_code=200,
+ content={"result": f"Файл {delete_data.file_name} успешно удален из корзины {s3_client.BUCKET_NAME}"}
+ )
+
+
+@app.post('/copy-to-another-bucket/', status_code=200)
+async def delete_video_from_s3(data: CopyToAnotherBucketS3):
+ s3_client = S3Client()
+ s3_client.copy_to_another_bucket(data.file_name, data.bucket_name)
+ return JSONResponse(
+ status_code=200,
+ content={"result": f"Файл {data.file_name} успешно скопирован в корзину {data.bucket_name}"}
+ )
+
+
uvicorn.run("src.web.main:app", host="0.0.0.0", log_level="info")
diff --git a/src/web/schemes/submit.py b/src/web/schemes/submit.py
index 629b69f..abc88d0 100644
--- a/src/web/schemes/submit.py
+++ b/src/web/schemes/submit.py
@@ -50,11 +50,20 @@ class MergeOutputFormatEnum(Enum):
@dataclass
class SubmitIn:
link: str = Form(...)
- video_format: VideoFormatEnum = Form(default=MergeOutputFormatEnum.format_webm)
- audio_format: AudioFormatEnum = Form(default=AudioFormatEnum.format_webm)
- resolution: ResolutionEnum = Form(default=ResolutionEnum.resolution_1080)
+ video_format: VideoFormatEnum = Form(default=MergeOutputFormatEnum.format_mp4)
+ audio_format: AudioFormatEnum = Form(default=AudioFormatEnum.format_m4a)
+ resolution: ResolutionEnum = Form(default=ResolutionEnum.resolution_720)
merge_output_format: MergeOutputFormatEnum = Form(default=MergeOutputFormatEnum.format_mkv)
class CheckIn(BaseModel):
link: str
+
+
+class DeleteFromS3(BaseModel):
+ file_name: str
+
+
+class CopyToAnotherBucketS3(BaseModel):
+ file_name: str
+ bucket_name: str
diff --git a/src/web/templates/index.html b/src/web/templates/index.html
index a965b0a..96ac597 100644
--- a/src/web/templates/index.html
+++ b/src/web/templates/index.html
@@ -77,9 +77,9 @@
-
+
-
+
Формат аудио
@@ -87,9 +87,9 @@
-
+
-
+
@@ -100,8 +100,8 @@
-
-
+
+