Fix Scrapy media pipeline initialization

This commit is contained in:
Abel Luck 2026-03-29 14:02:44 +02:00
parent 34d26f7def
commit 20b9759193
2 changed files with 71 additions and 19 deletions

View file

@@ -4,9 +4,9 @@ from io import BytesIO
from os import PathLike
from typing import Dict, List, Optional, Union
from scrapy.crawler import Crawler
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
import repub.utils
@@ -24,12 +24,9 @@ class ImagePipeline(BaseImagesPipeline):
class FilePipeline(BaseFilesPipeline):
def __init__(self, store_uri, **kwargs):
settings = kwargs["settings"]
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
super().__init__(store_uri, **kwargs)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
"""Initialize the pipeline, keeping a reference to the crawler's settings.

:param store_uri: destination for downloaded files (path or URI).
:param crawler: the Scrapy crawler; its settings are stored on the
    instance before delegating to the base FilesPipeline.
"""
self.settings = crawler.settings
# Base class is handed the crawler explicitly instead of a bare settings
# object (the commit replaces the old settings-dict constructor).
super().__init__(store_uri, crawler=crawler)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_file_path(request.url)
@@ -52,14 +49,12 @@ class TranscodePipeline(BaseFilesPipeline):
self,
media_type: repub.utils.FileType,
store_uri: Union[str, PathLike],
**kwargs,
*,
crawler: Crawler,
):
settings = kwargs["settings"]
self.media_type = media_type
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
super().__init__(store_uri, **kwargs)
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
def file_downloaded(self, response, request, info, *, item=None):
return self.media_downloaded(response, request, info, item=item)
@@ -139,9 +134,13 @@ class AudioPipeline(TranscodePipeline):
DEFAULT_FILES_URLS_FIELD = "audio_urls"
DEFAULT_FILES_RESULT_FIELD = "audios"
def __init__(self, store_uri: Union[str, PathLike], **kwargs):
store_uri = kwargs["settings"]["AUDIO_STORE"]
super().__init__(repub.utils.FileType.AUDIO, store_uri, **kwargs)
@classmethod
def from_crawler(cls, crawler: Crawler):
"""Build an AudioPipeline whose store comes from the AUDIO_STORE setting."""
# Base-class helper; assumed to refresh store handlers from settings
# before instantiation — TODO confirm in the base pipeline.
cls._update_stores(crawler.settings)
return cls(crawler.settings["AUDIO_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
"""Delegate to TranscodePipeline with the AUDIO media type."""
super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_audio_path(request.url)
@@ -164,9 +163,13 @@ class VideoPipeline(TranscodePipeline):
DEFAULT_FILES_URLS_FIELD = "video_urls"
DEFAULT_FILES_RESULT_FIELD = "videos"
def __init__(self, store_uri: Union[str, PathLike], **kwargs):
store_uri = kwargs["settings"]["VIDEO_STORE"]
super().__init__(repub.utils.FileType.VIDEO, store_uri, **kwargs)
@classmethod
def from_crawler(cls, crawler: Crawler):
"""Build a VideoPipeline whose store comes from the VIDEO_STORE setting."""
# Base-class helper; assumed to refresh store handlers from settings
# before instantiation — TODO confirm in the base pipeline.
cls._update_stores(crawler.settings)
return cls(crawler.settings["VIDEO_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
"""Delegate to TranscodePipeline with the VIDEO media type."""
super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_video_path(request.url)

49
tests/test_pipelines.py Normal file
View file

@@ -0,0 +1,49 @@
from pathlib import Path
from types import SimpleNamespace
import pytest
from repub.config import (
FeedConfig,
RepublisherConfig,
build_base_settings,
build_feed_settings,
)
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
    """Build a minimal crawler stand-in backed by a real feed configuration.

    Only the attributes the media pipelines actually read (``settings`` and
    ``request_fingerprinter``) are provided, so a SimpleNamespace suffices.
    """
    mirror_dir = (tmp_path / "mirror").resolve()
    feed = FeedConfig(
        name="nasa",
        url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
    )
    config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=mirror_dir,
        feeds=(feed,),
        scrapy_settings={},
    )
    feed_settings = build_feed_settings(
        build_base_settings(config), out_dir=mirror_dir, feed_name="nasa"
    )
    return SimpleNamespace(settings=feed_settings, request_fingerprinter=object())
@pytest.mark.parametrize(
    ("pipeline_cls", "store_setting"),
    [
        (AudioPipeline, "AUDIO_STORE"),
        (VideoPipeline, "VIDEO_STORE"),
        (FilePipeline, "FILES_STORE"),
    ],
)
def test_pipeline_from_crawler_uses_configured_store(
    tmp_path: Path, pipeline_cls, store_setting: str
) -> None:
    """A pipeline built via from_crawler must keep the crawler's settings
    object and point its file store at the configured directory."""
    fake_crawler = build_test_crawler(tmp_path)
    built = pipeline_cls.from_crawler(fake_crawler)
    assert built.settings is fake_crawler.settings
    assert built.store.basedir == fake_crawler.settings[store_setting]