Fix Scrapy media pipeline initialization

This commit is contained in:
Abel Luck 2026-03-29 14:02:44 +02:00
parent 34d26f7def
commit 20b9759193
2 changed files with 71 additions and 19 deletions

View file

@@ -4,9 +4,9 @@ from io import BytesIO
from os import PathLike
from typing import Dict, List, Optional, Union
from scrapy.crawler import Crawler
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
import repub.utils
@@ -24,12 +24,9 @@ class ImagePipeline(BaseImagesPipeline):
class FilePipeline(BaseFilesPipeline):
def __init__(self, store_uri, **kwargs):
settings = kwargs["settings"]
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
super().__init__(store_uri, **kwargs)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_file_path(request.url)
@@ -52,14 +49,12 @@ class TranscodePipeline(BaseFilesPipeline):
self,
media_type: repub.utils.FileType,
store_uri: Union[str, PathLike],
**kwargs,
*,
crawler: Crawler,
):
settings = kwargs["settings"]
self.media_type = media_type
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
super().__init__(store_uri, **kwargs)
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
def file_downloaded(self, response, request, info, *, item=None):
return self.media_downloaded(response, request, info, item=item)
@@ -139,9 +134,13 @@ class AudioPipeline(TranscodePipeline):
DEFAULT_FILES_URLS_FIELD = "audio_urls"
DEFAULT_FILES_RESULT_FIELD = "audios"
def __init__(self, store_uri: Union[str, PathLike], **kwargs):
store_uri = kwargs["settings"]["AUDIO_STORE"]
super().__init__(repub.utils.FileType.AUDIO, store_uri, **kwargs)
@classmethod
def from_crawler(cls, crawler: Crawler):
cls._update_stores(crawler.settings)
return cls(crawler.settings["AUDIO_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_audio_path(request.url)
@@ -164,9 +163,13 @@ class VideoPipeline(TranscodePipeline):
DEFAULT_FILES_URLS_FIELD = "video_urls"
DEFAULT_FILES_RESULT_FIELD = "videos"
def __init__(self, store_uri: Union[str, PathLike], **kwargs):
store_uri = kwargs["settings"]["VIDEO_STORE"]
super().__init__(repub.utils.FileType.VIDEO, store_uri, **kwargs)
@classmethod
def from_crawler(cls, crawler: Crawler):
cls._update_stores(crawler.settings)
return cls(crawler.settings["VIDEO_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_video_path(request.url)