republisher/repub/pipelines.py

95 lines
3.5 KiB
Python

import logging
import tempfile
from io import BytesIO
from os import PathLike
from pathlib import PurePosixPath
from typing import IO, DefaultDict, Dict, Optional, Set, Tuple, Union
from urllib.parse import urlparse
import repub.utils
from repub import media
from repub.exporters import RssExporter
from scrapy.pipelines.files import FileException
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
from scrapy.utils.misc import md5sum
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class ImagePipeline(BaseImagesPipeline):
    """Images pipeline whose storage paths are computed by ``repub.utils``."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image requested by ``request``.

        Only ``request.url`` is consulted; ``response``, ``info`` and ``item``
        are accepted but not used.
        """
        source_url = request.url
        return repub.utils.local_image_path(source_url)

    def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
        """Always raise ``NotImplementedError``; no thumbnail path is provided."""
        raise NotImplementedError()
class FilePipeline(BaseFilesPipeline):
    """Files pipeline whose storage paths are computed by ``repub.utils``."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the file requested by ``request``.

        Only ``request.url`` is consulted; ``response``, ``info`` and ``item``
        are accepted but not used.
        """
        source_url = request.url
        return repub.utils.local_file_path(source_url)
class AudioPipeline(BaseFilesPipeline):
    """Files pipeline that stores downloaded audio, compressing it when needed.

    Uses ``audio_urls`` / ``audios`` as the URL and result field names and
    takes its storage location from the ``AUDIO_STORE`` setting.
    """

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        """Set the audio field names and initialise storage from ``AUDIO_STORE``.

        Args:
            store_uri: Ignored; it is replaced by ``settings["AUDIO_STORE"]``.
            **kwargs: Forwarded unchanged to the base initializer. Must
                include a ``settings`` entry.

        Raises:
            KeyError: If no ``settings`` keyword argument is supplied.
        """
        # Assigned before the base initializer runs.
        # NOTE(review): presumably the base class picks these up while
        # initialising -- confirm against the Scrapy FilesPipeline source.
        self.FILES_URLS_FIELD = "audio_urls"
        self.FILES_RESULT_FIELD = "audios"
        store_uri = kwargs["settings"]["AUDIO_STORE"]
        super().__init__(store_uri, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the audio requested by ``request``.

        Only ``request.url`` is consulted.
        """
        return repub.utils.local_audio_path(request.url)

    def file_downloaded(self, response, request, info, *, item=None):
        """Delegate to :meth:`audio_downloaded` and return its result."""
        return self.audio_downloaded(response, request, info, item=item)

    def audio_downloaded(self, response, request, info, *, item=None):
        """Persist every buffer produced by :meth:`get_audio`.

        Returns:
            The MD5 checksum of the first buffer produced, or ``None`` if
            :meth:`get_audio` produced nothing.
        """
        checksum = None
        for path, buf in self.get_audio(response, request, info, item=item):
            if checksum is None:
                # Rewind first so the checksum covers the whole buffer.
                buf.seek(0)
                checksum = md5sum(buf)
            self.store.persist_file(
                path,
                buf,
                info,
                # NOTE(review): this header is sent whether or not the audio
                # was re-encoded, and the IANA-registered MP3 type is
                # "audio/mpeg" -- confirm which value consumers expect.
                headers={"Content-Type": "audio/mp3"},
            )
        return checksum

    def get_audio(self, response, request, info, *, item=None):
        """Yield one ``(path, buffer)`` pair for the downloaded audio.

        The response body is written to a temporary file and inspected by
        ``media.compression_settings``. If that returns settings, the file is
        compressed with ``media.compress_audio`` and the converted bytes are
        yielded; otherwise the original body is yielded unchanged.

        Yields:
            A tuple of the storage path and a ``BytesIO`` positioned at 0.
        """
        path = self.file_path(request, response=response, info=info, item=item)
        payload = None
        # All temp-file work finishes inside this block so the directory is
        # removed before control is handed to the caller, which may spend a
        # long time persisting the buffer.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_file = f"{tmpdir}/file"
            converted_file_base = f"{tmpdir}/converted"
            with open(tmp_file, "wb") as f:
                f.write(response.body)
            # A ``None`` result is treated as "no compression required".
            s = media.compression_settings(tmp_file, {})
            if s is not None:
                converted_file = media.compress_audio(tmp_file, converted_file_base, s)
                with open(converted_file, "rb") as f:
                    payload = f.read()
        if payload is None:
            logger.info(
                "Skipping audio compression for %s, it meets requirements", path
            )
            payload = response.body
        yield path, BytesIO(payload)
class VideoPipeline(BaseFilesPipeline):
    """Files pipeline for video, stored at the ``VIDEO_STORE`` location.

    Uses ``video_urls`` / ``videos`` as the URL and result field names.
    """

    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
        """Set the video field names and initialise storage from ``VIDEO_STORE``.

        The ``store_uri`` argument is discarded in favour of
        ``kwargs["settings"]["VIDEO_STORE"]``; all keyword arguments are
        forwarded to the base initializer.
        """
        # Assigned before the base initializer runs.
        self.FILES_URLS_FIELD = "video_urls"
        self.FILES_RESULT_FIELD = "videos"
        pipeline_settings = kwargs["settings"]
        video_store = pipeline_settings["VIDEO_STORE"]
        super().__init__(video_store, **kwargs)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the video requested by ``request``.

        Only ``request.url`` is consulted; the remaining arguments are unused.
        """
        source_url = request.url
        return repub.utils.local_video_path(source_url)