republisher/repub/pipelines.py

96 lines
3.5 KiB
Python
Raw Normal View History

2024-04-18 17:28:09 +02:00
import logging
import tempfile
from io import BytesIO
from os import PathLike
from pathlib import PurePosixPath
2024-04-18 17:28:09 +02:00
from typing import IO, DefaultDict, Dict, Optional, Set, Tuple, Union
from urllib.parse import urlparse
2024-04-18 11:57:24 +02:00
import repub.utils
2024-04-18 17:28:09 +02:00
from repub import media
from repub.exporters import RssExporter
2024-04-18 17:28:09 +02:00
from scrapy.pipelines.files import FileException
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
2024-04-18 17:28:09 +02:00
from scrapy.utils.misc import md5sum
# Module-level logger named after this module; used by the pipelines below.
logger = logging.getLogger(__name__)
2024-04-18 11:57:24 +02:00
class ImagePipeline(BaseImagesPipeline):
    """Images pipeline whose storage paths come from ``repub.utils``."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image fetched by ``request``.

        Only ``request.url`` is consulted; ``response``, ``info`` and ``item``
        are accepted for signature compatibility and otherwise unused.
        """
        source_url = request.url
        return repub.utils.local_image_path(source_url)

    def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
        """Always raise ``NotImplementedError``; no thumbnail path is provided."""
        raise NotImplementedError()
2024-04-18 11:57:24 +02:00
class FilePipeline(BaseFilesPipeline):
    """Files pipeline whose storage paths come from ``repub.utils``."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the file fetched by ``request``.

        Only ``request.url`` is consulted; ``response``, ``info`` and ``item``
        are accepted for signature compatibility and otherwise unused.
        """
        source_url = request.url
        return repub.utils.local_file_path(source_url)
2024-04-18 11:57:24 +02:00
class AudioPipeline(BaseFilesPipeline):
    """Files pipeline for audio that re-encodes downloads when required.

    The pipeline sets ``FILES_URLS_FIELD`` to ``"audio_urls"`` and
    ``FILES_RESULT_FIELD`` to ``"audios"`` before the base class is
    initialised, and stores files under the ``AUDIO_STORE`` setting.
    """

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        """Initialise the pipeline against the ``AUDIO_STORE`` location.

        Args:
            store_uri: Accepted for signature compatibility but ignored; it is
                replaced by ``kwargs["settings"]["AUDIO_STORE"]``.
            **kwargs: Forwarded unchanged to the base class. Must contain a
                ``settings`` mapping with an ``AUDIO_STORE`` entry.

        Raises:
            KeyError: If ``settings`` is not passed as a keyword argument, or
                (for a plain mapping) if it has no ``AUDIO_STORE`` entry.
        """
        # Set before super().__init__() so the base class sees these values.
        self.FILES_URLS_FIELD = "audio_urls"
        self.FILES_RESULT_FIELD = "audios"
        store_uri = kwargs["settings"]["AUDIO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the audio fetched by ``request``.

        Only ``request.url`` is consulted.
        """
        return repub.utils.local_audio_path(request.url)

    def file_downloaded(self, response, request, info, *, item=None):
        """Handle a finished download by delegating to ``audio_downloaded``."""
        return self.audio_downloaded(response, request, info, item=item)

    def audio_downloaded(self, response, request, info, *, item=None):
        """Persist every buffer produced by ``get_audio``.

        Returns:
            The MD5 checksum of the first buffer yielded by ``get_audio``, or
            ``None`` if nothing was yielded.
        """
        checksum = None
        for path, buf in self.get_audio(response, request, info, item=item):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            # NOTE(review): the header is fixed to "audio/mp3" whether or not
            # the file was re-encoded, and the registered MIME type for MP3 is
            # "audio/mpeg" -- confirm this is what downstream consumers expect.
            self.store.persist_file(
                path,
                buf,
                info,
                headers={"Content-Type": "audio/mp3"},
            )
        return checksum

    def get_audio(self, response, request, info, *, item=None):
        """Yield ``(path, buffer)`` for the downloaded audio.

        The response body is written to a temporary file and inspected with
        ``media.compression_settings``. When that returns settings, the file is
        converted with ``media.compress_audio`` and the converted bytes are
        yielded; when it returns ``None`` the original body is yielded as is.

        Yields:
            A single ``(path, BytesIO)`` pair, with the buffer positioned at
            its start.
        """
        path = self.file_path(request, response=response, info=info, item=item)
        body = response.body
        converted = None
        with tempfile.TemporaryDirectory() as tmpdir:
            source_file = f"{tmpdir}/file"
            converted_file_base = f"{tmpdir}/converted"
            with open(source_file, "wb") as f:
                f.write(body)
            settings = media.compression_settings(source_file, {})
            if settings is not None:
                converted_file = media.compress_audio(
                    source_file, converted_file_base, settings
                )
                with open(converted_file, "rb") as f:
                    converted = f.read()
        # Everything needed is in memory now, so the temporary directory is
        # removed before control is handed back to the consumer; it no longer
        # lingers on disk while the (possibly slow) persist step runs.
        if converted is None:
            logger.info(
                f"Skipping audio compression for {path}, it meets requirements"
            )
            yield path, BytesIO(body)
        else:
            yield path, BytesIO(converted)
2024-04-18 11:57:24 +02:00
class VideoPipeline(BaseFilesPipeline):
    """Files pipeline for video, stored under the ``VIDEO_STORE`` setting.

    Sets ``FILES_URLS_FIELD`` to ``"video_urls"`` and ``FILES_RESULT_FIELD``
    to ``"videos"`` before the base class is initialised.
    """

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        """Initialise the pipeline against the ``VIDEO_STORE`` location.

        The ``store_uri`` argument is ignored in favour of
        ``kwargs["settings"]["VIDEO_STORE"]``; ``kwargs`` is forwarded
        unchanged to the base class.
        """
        # Set before super().__init__() so the base class sees these values.
        self.FILES_URLS_FIELD = "video_urls"
        self.FILES_RESULT_FIELD = "videos"
        super().__init__(kwargs["settings"]["VIDEO_STORE"], **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the video fetched by ``request``.

        Only ``request.url`` is consulted.
        """
        source_url = request.url
        return repub.utils.local_video_path(source_url)