Fix Scrapy media pipeline initialization
This commit is contained in:
parent 34d26f7def
commit 20b9759193

2 changed files with 71 additions and 19 deletions
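The first set of hunks below modifies the repub.pipelines module; the second file is a new test, tests/test_pipelines.py. The change drops the old dict-based settings plumbing and switches the media pipelines to crawler-based initialization: each __init__ now takes a keyword-only crawler argument and reads its configuration from crawler.settings, and AudioPipeline/VideoPipeline gain from_crawler() classmethods. A rough sketch of the resulting entry point, assuming Scrapy's scrapy.utils.test.get_crawler helper (the store path is a throwaway example, not part of this commit):

    import tempfile

    from scrapy.utils.test import get_crawler

    from repub.pipelines import FilePipeline

    # get_crawler wraps a plain settings dict in a real Crawler; Scrapy itself
    # calls from_crawler() when it assembles the item pipeline chain.
    crawler = get_crawler(settings_dict={"FILES_STORE": tempfile.mkdtemp()})
    pipeline = FilePipeline.from_crawler(crawler)
    assert pipeline.settings is crawler.settings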
@@ -4,9 +4,9 @@ from io import BytesIO
 from os import PathLike
 from typing import Dict, List, Optional, Union

+from scrapy.crawler import Crawler
 from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
 from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
-from scrapy.settings import Settings
 from scrapy.utils.misc import md5sum

 import repub.utils
@@ -24,12 +24,9 @@ class ImagePipeline(BaseImagesPipeline):


 class FilePipeline(BaseFilesPipeline):
-    def __init__(self, store_uri, **kwargs):
-        settings = kwargs["settings"]
-        if isinstance(settings, dict) or settings is None:
-            settings = Settings(settings)
-        self.settings = settings
-        super().__init__(store_uri, **kwargs)
+    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
+        self.settings = crawler.settings
+        super().__init__(store_uri, crawler=crawler)

     def file_path(self, request, response=None, info=None, *, item=None):
         return repub.utils.local_file_path(request.url)
@@ -52,14 +49,12 @@ class TranscodePipeline(BaseFilesPipeline):
         self,
         media_type: repub.utils.FileType,
         store_uri: Union[str, PathLike],
-        **kwargs,
+        *,
+        crawler: Crawler,
     ):
-        settings = kwargs["settings"]
         self.media_type = media_type
-        if isinstance(settings, dict) or settings is None:
-            settings = Settings(settings)
-        self.settings = settings
-        super().__init__(store_uri, **kwargs)
+        self.settings = crawler.settings
+        super().__init__(store_uri, crawler=crawler)

     def file_downloaded(self, response, request, info, *, item=None):
         return self.media_downloaded(response, request, info, item=item)
@@ -139,9 +134,13 @@ class AudioPipeline(TranscodePipeline):
     DEFAULT_FILES_URLS_FIELD = "audio_urls"
     DEFAULT_FILES_RESULT_FIELD = "audios"

-    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
-        store_uri = kwargs["settings"]["AUDIO_STORE"]
-        super().__init__(repub.utils.FileType.AUDIO, store_uri, **kwargs)
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        cls._update_stores(crawler.settings)
+        return cls(crawler.settings["AUDIO_STORE"], crawler=crawler)
+
+    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
+        super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler)

     def file_path(self, request, response=None, info=None, *, item=None):
         return repub.utils.local_audio_path(request.url)
@@ -164,9 +163,13 @@ class VideoPipeline(TranscodePipeline):
     DEFAULT_FILES_URLS_FIELD = "video_urls"
     DEFAULT_FILES_RESULT_FIELD = "videos"

-    def __init__(self, store_uri: Union[str, PathLike], **kwargs):
-        store_uri = kwargs["settings"]["VIDEO_STORE"]
-        super().__init__(repub.utils.FileType.VIDEO, store_uri, **kwargs)
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        cls._update_stores(crawler.settings)
+        return cls(crawler.settings["VIDEO_STORE"], crawler=crawler)
+
+    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
+        super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler)

     def file_path(self, request, response=None, info=None, *, item=None):
         return repub.utils.local_video_path(request.url)
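In a plain Scrapy settings module, pipelines like these would be enabled through the ITEM_PIPELINES setting together with their per-type store paths. A hypothetical sketch follows; the priorities and paths are made-up example values, and only the *_STORE key names come from this diff:

    # Hypothetical Scrapy settings fragment; adjust priorities and paths as needed.
    ITEM_PIPELINES = {
        "repub.pipelines.FilePipeline": 100,
        "repub.pipelines.AudioPipeline": 200,
        "repub.pipelines.VideoPipeline": 300,
    }
    FILES_STORE = "mirror/files"
    AUDIO_STORE = "mirror/audio"
    VIDEO_STORE = "mirror/video"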
tests/test_pipelines.py (new file, 49 lines)
@@ -0,0 +1,49 @@
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+
+from repub.config import (
+    FeedConfig,
+    RepublisherConfig,
+    build_base_settings,
+    build_feed_settings,
+)
+from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
+
+
+def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
+    out_dir = (tmp_path / "mirror").resolve()
+    config = RepublisherConfig(
+        config_path=tmp_path / "repub.toml",
+        out_dir=out_dir,
+        feeds=(
+            FeedConfig(
+                name="nasa",
+                url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
+            ),
+        ),
+        scrapy_settings={},
+    )
+    base_settings = build_base_settings(config)
+    settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name="nasa")
+    return SimpleNamespace(settings=settings, request_fingerprinter=object())
+
+
+@pytest.mark.parametrize(
+    ("pipeline_cls", "store_setting"),
+    [
+        (AudioPipeline, "AUDIO_STORE"),
+        (VideoPipeline, "VIDEO_STORE"),
+        (FilePipeline, "FILES_STORE"),
+    ],
+)
+def test_pipeline_from_crawler_uses_configured_store(
+    tmp_path: Path, pipeline_cls, store_setting: str
+) -> None:
+    crawler = build_test_crawler(tmp_path)
+
+    pipeline = pipeline_cls.from_crawler(crawler)
+
+    assert pipeline.settings is crawler.settings
+    assert pipeline.store.basedir == crawler.settings[store_setting]
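Note the crawler stand-in: the test feeds from_crawler() a SimpleNamespace exposing only settings and request_fingerprinter, which appears to be everything the pipelines need during construction, so no full Crawler or network setup is required.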