"""Scrapy media pipelines that store downloads under repub-computed local paths."""

import logging
import tempfile
from io import BytesIO
from os import PathLike
from pathlib import PurePosixPath
from typing import IO, DefaultDict, Dict, Optional, Set, Tuple, Union
from urllib.parse import urlparse

import repub.utils
from repub import media
from repub.exporters import RssExporter
from scrapy.pipelines.files import FileException
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
from scrapy.utils.misc import md5sum

logger = logging.getLogger(__name__)


class ImagePipeline(BaseImagesPipeline):
    """Images pipeline whose storage path comes from ``repub.utils``."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the local storage path for the image at ``request.url``."""
        return repub.utils.local_image_path(request.url)

    def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
        """Thumbnail paths are not implemented by this pipeline.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()


class FilePipeline(BaseFilesPipeline):
    """Files pipeline whose storage path comes from ``repub.utils``."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the local storage path for the file at ``request.url``."""
        return repub.utils.local_file_path(request.url)


class AudioPipeline(BaseFilesPipeline):
    """Files pipeline for audio: optionally compresses, then persists.

    Reads URLs from the ``audio_urls`` item field and writes results to the
    ``audios`` field. Files are stored under the ``AUDIO_STORE`` setting.
    """

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        """Initialise the pipeline against the ``AUDIO_STORE`` location.

        Args:
            store_uri: Ignored; it is replaced by ``settings["AUDIO_STORE"]``.
            **kwargs: Forwarded to the base pipeline. Must contain
                ``settings``.

        Raises:
            KeyError: if ``settings`` is not passed as a keyword argument.
        """
        # Assigned before the base initialiser runs; Scrapy's FilesPipeline
        # is expected to read these attributes as the field-name defaults.
        self.FILES_URLS_FIELD = "audio_urls"
        self.FILES_RESULT_FIELD = "audios"
        store_uri = kwargs["settings"]["AUDIO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the local storage path for the audio at ``request.url``."""
        return repub.utils.local_audio_path(request.url)

    def file_downloaded(self, response, request, info, *, item=None):
        """Handle a downloaded file by delegating to ``audio_downloaded``."""
        return self.audio_downloaded(response, request, info, item=item)

    def audio_downloaded(self, response, request, info, *, item=None):
        """Persist every buffer produced by ``get_audio``.

        Returns:
            The MD5 checksum of the first buffer yielded by ``get_audio``,
            or ``None`` if nothing was yielded.
        """
        checksum = None
        for path, buf in self.get_audio(response, request, info, item=item):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            # NOTE(review): the Content-Type is hard-coded to audio/mp3 no
            # matter what format the buffer actually holds - confirm.
            self.store.persist_file(
                path,
                buf,
                info,
                headers={"Content-Type": "audio/mp3"},
            )
        return checksum

    def get_audio(self, response, request, info, *, item=None):
        """Yield ``(path, buffer)`` for the downloaded audio.

        The response body is written to a temporary file and passed to
        ``media.compression_settings``. If that returns settings, the file is
        compressed with ``media.compress_audio`` and the compressed bytes are
        yielded; if it returns ``None``, the original bytes are yielded
        unchanged.

        Yields:
            A ``(path, BytesIO)`` tuple with the buffer positioned at 0.
        """
        path = self.file_path(request, response=response, info=info, item=item)
        converted = None
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_file = f"{tmpdir}/file"
            converted_file_base = f"{tmpdir}/converted"
            with open(tmp_file, "wb") as f:
                f.write(response.body)
            settings = media.compression_settings(tmp_file, {})
            if settings is not None:
                converted_file = media.compress_audio(
                    tmp_file, converted_file_base, settings
                )
                with open(converted_file, "rb") as f:
                    converted = f.read()
        # Everything needed is in memory now, so the temporary directory is
        # already removed before the consumer starts persisting the buffer.
        if converted is None:
            logger.info(
                "Skipping audio compression for %s, it meets requirements", path
            )
            yield path, BytesIO(response.body)
        else:
            yield path, BytesIO(converted)


class VideoPipeline(BaseFilesPipeline):
    """Files pipeline for video, stored under the ``VIDEO_STORE`` setting.

    Reads URLs from the ``video_urls`` item field and writes results to the
    ``videos`` field.
    """

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        """Initialise the pipeline against the ``VIDEO_STORE`` location.

        Args:
            store_uri: Ignored; it is replaced by ``settings["VIDEO_STORE"]``.
            **kwargs: Forwarded to the base pipeline. Must contain
                ``settings``.

        Raises:
            KeyError: if ``settings`` is not passed as a keyword argument.
        """
        # Assigned before the base initialiser runs; Scrapy's FilesPipeline
        # is expected to read these attributes as the field-name defaults.
        self.FILES_URLS_FIELD = "video_urls"
        self.FILES_RESULT_FIELD = "videos"
        store_uri = kwargs["settings"]["VIDEO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the local storage path for the video at ``request.url``."""
        return repub.utils.local_video_path(request.url)