From 20b97591934e541d4e45d420ff5a4271470f15c5 Mon Sep 17 00:00:00 2001
From: Abel Luck
Date: Sun, 29 Mar 2026 14:02:44 +0200
Subject: [PATCH] Fix Scrapy media pipeline initialization

---
Reviewer notes (this area is free-form commentary, not part of the commit
message):

* FilePipeline and TranscodePipeline constructors now take a keyword-only
  `crawler` argument and read `self.settings` from `crawler.settings`
  instead of from a `settings` entry in **kwargs.
* AudioPipeline and VideoPipeline gain `from_crawler` classmethods that
  pass the AUDIO_STORE / VIDEO_STORE setting as the store URI.
* tests/test_pipelines.py is new and exercises `from_crawler` for the
  Audio, Video and File pipelines.
* Line structure and indentation of this patch were restored from a
  whitespace-collapsed copy; blank context lines were inferred from the
  hunk line counts.

 repub/pipelines.py      | 41 ++++++++++++++++++----------------
 tests/test_pipelines.py | 49 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 19 deletions(-)
 create mode 100644 tests/test_pipelines.py

diff --git a/repub/pipelines.py b/repub/pipelines.py
index 0147a20..4f14f19 100644
--- a/repub/pipelines.py
+++ b/repub/pipelines.py
@@ -4,9 +4,9 @@ from io import BytesIO
 from os import PathLike
 from typing import Dict, List, Optional, Union
 
+from scrapy.crawler import Crawler
 from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
 from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
-from scrapy.settings import Settings
 from scrapy.utils.misc import md5sum
 
 import repub.utils
@@ -24,12 +24,9 @@ class ImagePipeline(BaseImagesPipeline):
 
 
 class FilePipeline(BaseFilesPipeline):
-    def __init__(self, store_uri, **kwargs):
-        settings = kwargs["settings"]
-        if isinstance(settings, dict) or settings is None:
-            settings = Settings(settings)
-        self.settings = settings
-        super().__init__(store_uri, **kwargs)
+    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
+        self.settings = crawler.settings
+        super().__init__(store_uri, crawler=crawler)
 
     def file_path(self, request, response=None, info=None, *, item=None):
         return repub.utils.local_file_path(request.url)
@@ -52,14 +49,12 @@ class TranscodePipeline(BaseFilesPipeline):
         self,
         media_type: repub.utils.FileType,
         store_uri: Union[str, PathLike],
-        **kwargs,
+        *,
+        crawler: Crawler,
     ):
-        settings = kwargs["settings"]
         self.media_type = media_type
-        if isinstance(settings, dict) or settings is None:
-            settings = Settings(settings)
-        self.settings = settings
-        super().__init__(store_uri, **kwargs)
+        self.settings = crawler.settings
+        super().__init__(store_uri, crawler=crawler)
 
     def file_downloaded(self, response, request, info, *, item=None):
         return self.media_downloaded(response, request, info, item=item)
@@ -139,9 +134,13 @@ class AudioPipeline(TranscodePipeline):
     DEFAULT_FILES_URLS_FIELD = "audio_urls"
     DEFAULT_FILES_RESULT_FIELD = "audios"
 
-    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
-        store_uri = kwargs["settings"]["AUDIO_STORE"]
-        super().__init__(repub.utils.FileType.AUDIO, store_uri, **kwargs)
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        cls._update_stores(crawler.settings)
+        return cls(crawler.settings["AUDIO_STORE"], crawler=crawler)
+
+    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
+        super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler)
 
     def file_path(self, request, response=None, info=None, *, item=None):
         return repub.utils.local_audio_path(request.url)
@@ -164,9 +163,13 @@ class VideoPipeline(TranscodePipeline):
     DEFAULT_FILES_URLS_FIELD = "video_urls"
     DEFAULT_FILES_RESULT_FIELD = "videos"
 
-    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
-        store_uri = kwargs["settings"]["VIDEO_STORE"]
-        super().__init__(repub.utils.FileType.VIDEO, store_uri, **kwargs)
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        cls._update_stores(crawler.settings)
+        return cls(crawler.settings["VIDEO_STORE"], crawler=crawler)
+
+    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
+        super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler)
 
     def file_path(self, request, response=None, info=None, *, item=None):
         return repub.utils.local_video_path(request.url)
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
new file mode 100644
index 0000000..1bc27f2
--- /dev/null
+++ b/tests/test_pipelines.py
@@ -0,0 +1,49 @@
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+
+from repub.config import (
+    FeedConfig,
+    RepublisherConfig,
+    build_base_settings,
+    build_feed_settings,
+)
+from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
+
+
+def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
+    out_dir = (tmp_path / "mirror").resolve()
+    config = RepublisherConfig(
+        config_path=tmp_path / "repub.toml",
+        out_dir=out_dir,
+        feeds=(
+            FeedConfig(
+                name="nasa",
+                url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
+            ),
+        ),
+        scrapy_settings={},
+    )
+    base_settings = build_base_settings(config)
+    settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name="nasa")
+    return SimpleNamespace(settings=settings, request_fingerprinter=object())
+
+
+@pytest.mark.parametrize(
+    ("pipeline_cls", "store_setting"),
+    [
+        (AudioPipeline, "AUDIO_STORE"),
+        (VideoPipeline, "VIDEO_STORE"),
+        (FilePipeline, "FILES_STORE"),
+    ],
+)
+def test_pipeline_from_crawler_uses_configured_store(
+    tmp_path: Path, pipeline_cls, store_setting: str
+) -> None:
+    crawler = build_test_crawler(tmp_path)
+
+    pipeline = pipeline_cls.from_crawler(crawler)
+
+    assert pipeline.settings is crawler.settings
+    assert pipeline.store.basedir == crawler.settings[store_setting]