Fix Scrapy media pipeline initialization
This commit is contained in:
parent
34d26f7def
commit
20b9759193
2 changed files with 71 additions and 19 deletions
|
|
@@ -4,9 +4,9 @@ from io import BytesIO
|
||||||
from os import PathLike
|
from os import PathLike
|
||||||
from typing import Dict, List, Optional, Union
|
from typing import Dict, List, Optional, Union
|
||||||
|
|
||||||
|
from scrapy.crawler import Crawler
|
||||||
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
|
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
|
||||||
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
|
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
|
||||||
from scrapy.settings import Settings
|
|
||||||
from scrapy.utils.misc import md5sum
|
from scrapy.utils.misc import md5sum
|
||||||
|
|
||||||
import repub.utils
|
import repub.utils
|
||||||
|
|
@@ -24,12 +24,9 @@ class ImagePipeline(BaseImagesPipeline):
|
||||||
|
|
||||||
|
|
||||||
class FilePipeline(BaseFilesPipeline):
|
class FilePipeline(BaseFilesPipeline):
|
||||||
def __init__(self, store_uri, **kwargs):
|
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
|
||||||
settings = kwargs["settings"]
|
self.settings = crawler.settings
|
||||||
if isinstance(settings, dict) or settings is None:
|
super().__init__(store_uri, crawler=crawler)
|
||||||
settings = Settings(settings)
|
|
||||||
self.settings = settings
|
|
||||||
super().__init__(store_uri, **kwargs)
|
|
||||||
|
|
||||||
def file_path(self, request, response=None, info=None, *, item=None):
|
def file_path(self, request, response=None, info=None, *, item=None):
|
||||||
return repub.utils.local_file_path(request.url)
|
return repub.utils.local_file_path(request.url)
|
||||||
|
|
@@ -52,14 +49,12 @@ class TranscodePipeline(BaseFilesPipeline):
|
||||||
self,
|
self,
|
||||||
media_type: repub.utils.FileType,
|
media_type: repub.utils.FileType,
|
||||||
store_uri: Union[str, PathLike],
|
store_uri: Union[str, PathLike],
|
||||||
**kwargs,
|
*,
|
||||||
|
crawler: Crawler,
|
||||||
):
|
):
|
||||||
settings = kwargs["settings"]
|
|
||||||
self.media_type = media_type
|
self.media_type = media_type
|
||||||
if isinstance(settings, dict) or settings is None:
|
self.settings = crawler.settings
|
||||||
settings = Settings(settings)
|
super().__init__(store_uri, crawler=crawler)
|
||||||
self.settings = settings
|
|
||||||
super().__init__(store_uri, **kwargs)
|
|
||||||
|
|
||||||
def file_downloaded(self, response, request, info, *, item=None):
|
def file_downloaded(self, response, request, info, *, item=None):
|
||||||
return self.media_downloaded(response, request, info, item=item)
|
return self.media_downloaded(response, request, info, item=item)
|
||||||
|
|
@@ -139,9 +134,13 @@ class AudioPipeline(TranscodePipeline):
|
||||||
DEFAULT_FILES_URLS_FIELD = "audio_urls"
|
DEFAULT_FILES_URLS_FIELD = "audio_urls"
|
||||||
DEFAULT_FILES_RESULT_FIELD = "audios"
|
DEFAULT_FILES_RESULT_FIELD = "audios"
|
||||||
|
|
||||||
def __init__(self, store_uri: Union[str, PathLike], **kwargs):
|
@classmethod
|
||||||
store_uri = kwargs["settings"]["AUDIO_STORE"]
|
def from_crawler(cls, crawler: Crawler):
|
||||||
super().__init__(repub.utils.FileType.AUDIO, store_uri, **kwargs)
|
cls._update_stores(crawler.settings)
|
||||||
|
return cls(crawler.settings["AUDIO_STORE"], crawler=crawler)
|
||||||
|
|
||||||
|
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
|
||||||
|
super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler)
|
||||||
|
|
||||||
def file_path(self, request, response=None, info=None, *, item=None):
|
def file_path(self, request, response=None, info=None, *, item=None):
|
||||||
return repub.utils.local_audio_path(request.url)
|
return repub.utils.local_audio_path(request.url)
|
||||||
|
|
@@ -164,9 +163,13 @@ class VideoPipeline(TranscodePipeline):
|
||||||
DEFAULT_FILES_URLS_FIELD = "video_urls"
|
DEFAULT_FILES_URLS_FIELD = "video_urls"
|
||||||
DEFAULT_FILES_RESULT_FIELD = "videos"
|
DEFAULT_FILES_RESULT_FIELD = "videos"
|
||||||
|
|
||||||
def __init__(self, store_uri: Union[str, PathLike], **kwargs):
|
@classmethod
|
||||||
store_uri = kwargs["settings"]["VIDEO_STORE"]
|
def from_crawler(cls, crawler: Crawler):
|
||||||
super().__init__(repub.utils.FileType.VIDEO, store_uri, **kwargs)
|
cls._update_stores(crawler.settings)
|
||||||
|
return cls(crawler.settings["VIDEO_STORE"], crawler=crawler)
|
||||||
|
|
||||||
|
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
|
||||||
|
super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler)
|
||||||
|
|
||||||
def file_path(self, request, response=None, info=None, *, item=None):
|
def file_path(self, request, response=None, info=None, *, item=None):
|
||||||
return repub.utils.local_video_path(request.url)
|
return repub.utils.local_video_path(request.url)
|
||||||
|
|
|
||||||
49
tests/test_pipelines.py
Normal file
49
tests/test_pipelines.py
Normal file
|
|
@@ -0,0 +1,49 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from repub.config import (
|
||||||
|
FeedConfig,
|
||||||
|
RepublisherConfig,
|
||||||
|
build_base_settings,
|
||||||
|
build_feed_settings,
|
||||||
|
)
|
||||||
|
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
|
||||||
|
|
||||||
|
|
||||||
|
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
|
||||||
|
out_dir = (tmp_path / "mirror").resolve()
|
||||||
|
config = RepublisherConfig(
|
||||||
|
config_path=tmp_path / "repub.toml",
|
||||||
|
out_dir=out_dir,
|
||||||
|
feeds=(
|
||||||
|
FeedConfig(
|
||||||
|
name="nasa",
|
||||||
|
url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
scrapy_settings={},
|
||||||
|
)
|
||||||
|
base_settings = build_base_settings(config)
|
||||||
|
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_name="nasa")
|
||||||
|
return SimpleNamespace(settings=settings, request_fingerprinter=object())
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("pipeline_cls", "store_setting"),
|
||||||
|
[
|
||||||
|
(AudioPipeline, "AUDIO_STORE"),
|
||||||
|
(VideoPipeline, "VIDEO_STORE"),
|
||||||
|
(FilePipeline, "FILES_STORE"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_pipeline_from_crawler_uses_configured_store(
|
||||||
|
tmp_path: Path, pipeline_cls, store_setting: str
|
||||||
|
) -> None:
|
||||||
|
crawler = build_test_crawler(tmp_path)
|
||||||
|
|
||||||
|
pipeline = pipeline_cls.from_crawler(crawler)
|
||||||
|
|
||||||
|
assert pipeline.settings is crawler.settings
|
||||||
|
assert pipeline.store.basedir == crawler.settings[store_setting]
|
||||||
Loading…
Add table
Add a link
Reference in a new issue