From 89d462e280673b68a879789c2fdce773f1511ed7 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Tue, 31 Mar 2026 14:14:46 +0200 Subject: [PATCH] Fix published paths for transcoded media --- repub/exporters.py | 144 +++++++++++++- repub/items.py | 33 +++- repub/media.py | 75 +++++--- repub/pipelines.py | 236 +++++++++++++++++------ repub/spiders/rss_spider.py | 18 +- repub/utils.py | 26 ++- tests/test_feed_validation.py | 180 ++++++++++++++++-- tests/test_file_feeds.py | 16 +- tests/test_pipelines.py | 342 +++++++++++++++++++++++++++++++++- 9 files changed, 956 insertions(+), 114 deletions(-) diff --git a/repub/exporters.py b/repub/exporters.py index dc42ba7..99b0663 100644 --- a/repub/exporters.py +++ b/repub/exporters.py @@ -1,11 +1,20 @@ from io import BytesIO from typing import Any +from lxml.etree import QName from scrapy.exporters import BaseItemExporter from repub import rss +from repub.items import ( + ChannelElementItem, + ElementItem, + MediaVariant, + TranscodedMediaFile, +) +from repub.utils import FileType, determine_file_type -from .items import ChannelElementItem +MEDIA_CONTENT_TAG = QName(rss.nsmap["media"], "content").text +MEDIA_GROUP_TAG = QName(rss.nsmap["media"], "group").text class RssExporter(BaseItemExporter): @@ -38,8 +47,141 @@ class RssExporter(BaseItemExporter): self.export_rss_item(item) self.item_buffer = [] + def compact_attrib(self, **attrib): + return { + key: str(value) for key, value in attrib.items() if value not in (None, "") + } + + def canonical_variant(self, media_file: TranscodedMediaFile) -> MediaVariant | None: + for variant in media_file["variants"]: + if variant.get("isDefault") == "true": + return variant + if media_file["variants"]: + return media_file["variants"][0] + return None + + def rebuild_enclosures(self, item: ElementItem) -> None: + audio_lookup = {audio["published_url"]: audio for audio in item.audios} + for enclosure in item.el.findall("enclosure"): + media_file = audio_lookup.get(enclosure.get("url", "")) + if media_file is None: + continue + canonical = self.canonical_variant(media_file) + if canonical is None: + continue + enclosure.attrib.clear() + enclosure.attrib.update( + self.compact_attrib( + url=canonical.get("url"), + length=canonical.get("fileSize") or enclosure.get("length"), + type=canonical.get("type") or enclosure.get("type"), + ) + ) + + def owned_media_type(self, el, managed_types: set[FileType]) -> FileType | None: + url = el.get("url", "") + file_type = determine_file_type( + url=url, + medium=el.get("medium"), + mimetype=el.get("type"), + ) + if file_type in managed_types: + return file_type + return None + + def strip_managed_media_nodes(self, item: ElementItem) -> dict[str, dict[str, str]]: + fallbacks: dict[str, dict[str, str]] = {} + managed_types: set[FileType] = set() + if item.audios: + managed_types.add(FileType.AUDIO) + if item.videos: + managed_types.add(FileType.VIDEO) + if not managed_types: + return fallbacks + + for child in list(item.el): + if child.tag == MEDIA_CONTENT_TAG: + if self.owned_media_type(child, managed_types) is None: + continue + fallbacks[child.get("url", "")] = { + key: value + for key, value in child.attrib.items() + if key in {"expression", "lang"} + } + item.el.remove(child) + continue + + if child.tag != MEDIA_GROUP_TAG: + continue + for media_content in list(child): + if media_content.tag != MEDIA_CONTENT_TAG: + continue + if self.owned_media_type(media_content, managed_types) is None: + continue + fallbacks[media_content.get("url", "")] = { + key: value + for key, value in media_content.attrib.items() + if key in {"expression", "lang"} + } + child.remove(media_content) + if len(child) == 0: + item.el.remove(child) + return fallbacks + + def append_media_groups( + self, item: ElementItem, fallbacks: dict[str, dict[str, str]] + ): + for media_file in [*item.audios, *item.videos]: + if not media_file["variants"]: + continue + fallback_attrib = fallbacks.get(media_file["published_url"], {}) + group = rss.MEDIA.group( + *[ + rss.MEDIA.content( + **self.media_content_attrib(variant, fallback_attrib) + ) + for variant in media_file["variants"] + ] + ) + if group is not None: + item.el.append(group) + + def media_content_attrib( + self, variant: MediaVariant, fallback_attrib: dict[str, str] + ) -> dict[str, str]: + attrib = dict(fallback_attrib) + attrib.update( + self.compact_attrib( + url=variant.get("url"), + type=variant.get("type"), + medium=variant.get("medium"), + isDefault=variant.get("isDefault"), + expression=variant.get("expression"), + bitrate=variant.get("bitrate"), + framerate=variant.get("framerate"), + samplingrate=variant.get("samplingrate"), + channels=variant.get("channels"), + duration=variant.get("duration"), + height=variant.get("height"), + width=variant.get("width"), + lang=variant.get("lang"), + fileSize=variant.get("fileSize"), + ) + ) + return attrib + + def apply_transcoded_media(self, item: Any) -> None: + if not isinstance(item, ElementItem): + return + if not item.audios and not item.videos: + return + self.rebuild_enclosures(item) + fallbacks = self.strip_managed_media_nodes(item) + self.append_media_groups(item, fallbacks) + def export_rss_item(self, item: Any): assert self.channel is not None + self.apply_transcoded_media(item) self.channel.append(item.el) def finish_exporting(self) -> None: diff --git a/repub/items.py b/repub/items.py index 748858e..d5e77be 100644 --- a/repub/items.py +++ b/repub/items.py @@ -1,5 +1,32 @@ from dataclasses import dataclass -from typing import Any, List +from typing import Any, List, TypedDict + + +class MediaVariant(TypedDict, total=False): + url: str + path: str + type: str + medium: str + isDefault: str + fileSize: str + bitrate: int | float | str + samplingrate: int | str + channels: int | str + duration: str + width: int | str + height: int | str + framerate: str + expression: str + lang: str + + +class TranscodedMediaFile(TypedDict): + url: str + path: str + checksum: str | None + status: str + published_url: str + variants: List[MediaVariant] @dataclass @@ -11,9 +38,9 @@ class ElementItem: file_urls: List[str] files: List[Any] audio_urls: List[str] - audios: List[Any] + audios: List[TranscodedMediaFile] video_urls: List[str] - videos: List[Any] + videos: List[TranscodedMediaFile] @dataclass diff --git a/repub/media.py b/repub/media.py index 53499cc..54165fa 100644 --- a/repub/media.py +++ b/repub/media.py @@ -33,25 +33,21 @@ class VideoSettings(MediaSettings): ffmpeg_video_params: Dict[str, str] -class AudioMeta(TypedDict): - format_name: str - format_long_name: str +class AudioMeta(TypedDict, total=False): duration: str - bit_rate: float - size: str + fileSize: str + bitrate: int + samplingrate: int + channels: int -class VideoMeta(TypedDict): +class VideoMeta(TypedDict, total=False): duration: str - size: str - format_name: str - format_long_name: str + fileSize: str width: int height: int - codec_name: str - display_aspect_ratio: str - duration_ts: float - bit_rate: float + bitrate: int + framerate: str def _decode_ffmpeg_output(output: Any) -> str: @@ -157,32 +153,51 @@ def get_acodec_info(probe) -> Tuple[Optional[str], Optional[int]]: return None, None +def _int_value(value: Any) -> Optional[int]: + try: + if value in (None, ""): + return None + return int(str(value)) + except (TypeError, ValueError): + return None + + +def _frame_rate(stream: Dict[str, Any]) -> Optional[str]: + for key in ("avg_frame_rate", "r_frame_rate"): + value = stream.get(key) + if value not in (None, "", "0/0"): + return str(value) + return None + + def audio_meta(probe: Dict[str, Any]) -> Optional[AudioMeta]: - return AudioMeta( - duration=probe["format"].get("duration", ""), - size=probe["format"].get("size", ""), - format_name=probe["format"].get("format_name", ""), - format_long_name=probe["format"].get("format_long_name", ""), - bit_rate=float(probe["format"].get("bit_rate", 0.0)), + stream = primary_audio_stream(probe) + if not stream: + return None + meta = AudioMeta( + duration=str(probe["format"].get("duration", "")), + fileSize=str(probe["format"].get("size", "")), + bitrate=_int_value(probe["format"].get("bit_rate")) or 0, + samplingrate=_int_value(stream.get("sample_rate")) or 0, + channels=_int_value(stream.get("channels")) or 0, ) + return {key: value for key, value in meta.items() if value not in ("", 0)} def video_meta(probe: Dict[str, Any]) -> Optional[VideoMeta]: stream = primary_video_stream(probe) if not stream: return None - return VideoMeta( - duration=probe["format"].get("duration", ""), - size=probe["format"].get("size", ""), - format_name=probe["format"].get("format_name", ""), - format_long_name=probe["format"].get("format_long_name", ""), - width=int(stream.get("width", 0)), - height=int(stream.get("height", 0)), - codec_name=stream.get("codec_name", ""), - display_aspect_ratio=stream.get("display_aspect_ratio", ""), - duration_ts=float(stream.get("duration_ts", 0.0)), - bit_rate=float(stream.get("bit_rate", 0.0)), + meta = VideoMeta( + duration=str(probe["format"].get("duration", "")), + fileSize=str(probe["format"].get("size", "")), + width=_int_value(stream.get("width")) or 0, + height=_int_value(stream.get("height")) or 0, + bitrate=_int_value(stream.get("bit_rate") or probe["format"].get("bit_rate")) + or 0, + framerate=_frame_rate(stream) or "", ) + return {key: value for key, value in meta.items() if value not in ("", 0)} def audio_transcode_params( diff --git a/repub/pipelines.py b/repub/pipelines.py index 4f14f19..03d147d 100644 --- a/repub/pipelines.py +++ b/repub/pipelines.py @@ -1,16 +1,20 @@ +import hashlib import logging import tempfile +import time from io import BytesIO from os import PathLike -from typing import Dict, List, Optional, Union +from pathlib import Path +from typing import Any, Dict, List, Optional, Union, cast from scrapy.crawler import Crawler +from scrapy.pipelines.files import FileException from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline -from scrapy.utils.misc import md5sum import repub.utils from repub import media +from repub.items import MediaVariant, TranscodedMediaFile logger = logging.getLogger(__name__) @@ -32,7 +36,7 @@ class FilePipeline(BaseFilesPipeline): return repub.utils.local_file_path(request.url) -def read_asset(file_path) -> BytesIO: +def read_asset(file_path: str | Path) -> BytesIO: buf_converted = BytesIO() with open(file_path, "rb") as f: buf_converted.write(f.read()) @@ -40,8 +44,11 @@ def read_asset(file_path) -> BytesIO: return buf_converted -def media_final_path(base_path, name, ext): - return f"{base_path}-{name}.{ext}" +def buffer_checksum(buf: BytesIO) -> str: + buf.seek(0) + checksum = hashlib.md5(buf.read(), usedforsecurity=False).hexdigest() # nosec + buf.seek(0) + return checksum class TranscodePipeline(BaseFilesPipeline): @@ -56,37 +63,17 @@ class TranscodePipeline(BaseFilesPipeline): self.settings = crawler.settings super().__init__(store_uri, crawler=crawler) - def file_downloaded(self, response, request, info, *, item=None): - return self.media_downloaded(response, request, info, item=item) - - def media_downloaded(self, response, request, info, *, item=None): - checksum = None - for path, buf, meta, mime in self.get_media(response, request, info, item=item): - if checksum is None: - buf.seek(0) - checksum = md5sum(buf) - self.store.persist_file( - path, - buf, - info, - meta=meta, - headers={"Content-Type": mime}, - ) - return checksum - def transcode( self, input_file: str, settings: media.MediaSettings, tmp_dir: str ) -> Optional[str]: probe_result = media.probe_media(input_file) params = self.get_transcode_params(probe_result, settings) if params is not None: - converted_file = self.transcode_media(input_file, tmp_dir, params) - return converted_file - else: - logger.info( - f"Skipping audio compression for {input_file}, it meets requirements" - ) - return None + return self.transcode_media(input_file, tmp_dir, params) + logger.info( + f"Skipping audio compression for {input_file}, it meets requirements" + ) + return None def get_media_settings(self) -> List[media.MediaSettings]: raise NotImplementedError() @@ -100,37 +87,181 @@ class TranscodePipeline(BaseFilesPipeline): def get_media_meta(self, probe_result) -> media.MediaMeta: raise NotImplementedError() - def get_media(self, response, request, info, *, item=None): - buf = BytesIO(response.body) - base_path = self.file_path(request, response=response, info=info, item=item) + def media_dir(self) -> str: + setting_name = { + repub.utils.FileType.AUDIO: "REPUBLISHER_AUDIO_DIR", + repub.utils.FileType.VIDEO: "REPUBLISHER_VIDEO_DIR", + }.get(self.media_type) + if setting_name is None: + raise ValueError(f"Unsupported media type: {self.media_type}") + return self.settings[setting_name] + + def file_path(self, request, response=None, info=None, *, item=None): + return repub.utils.canonical_published_media_path( + self.media_type, + request.url, + self.get_media_settings(), + ) + + def variant_paths( + self, source_url: str + ) -> list[tuple[bool, media.MediaSettings, str]]: + settings = self.get_media_settings() + return [ + ( + index == 0, + setting, + repub.utils.published_media_path(self.media_type, source_url, setting), + ) + for index, setting in enumerate(settings) + ] + + def published_url(self, path: str, item=None) -> str: + relative_path = f"{self.media_dir()}/{path}" + feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/") + if feed_url == "" or item is None: + return relative_path + return f"{feed_url}/feeds/{item.feed_name}/{relative_path}" + + def local_store_path(self, path: str) -> Path: + return Path(cast(Any, self.store).basedir) / path + + def media_variant( + self, + *, + path: str, + setting: media.MediaSettings, + probe_result: dict[str, Any], + is_default: bool, + item=None, + ) -> MediaVariant: + variant: MediaVariant = { + "url": self.published_url(path, item), + "path": path, + "type": setting["mimetype"], + "medium": self.media_type.value, + "isDefault": "true" if is_default else "false", + } + meta = self.get_media_meta(probe_result) or {} + for key, value in meta.items(): + if value not in (None, ""): + variant[key] = value + return variant + + def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]: + variants: list[MediaVariant] = [] + for is_default, setting, path in self.variant_paths(request.url): + file_path = self.local_store_path(path) + if not file_path.exists(): + continue + probe_result = media.probe_media(str(file_path)) + variants.append( + self.media_variant( + path=path, + setting=setting, + probe_result=probe_result, + is_default=is_default, + item=item, + ) + ) + return variants + + def make_file_result( + self, + request, + *, + checksum: str | None, + status: str, + item=None, + ) -> TranscodedMediaFile: + path = self.file_path(request, item=item) + return { + "url": request.url, + "path": path, + "published_url": self.published_url(path, item), + "checksum": checksum, + "status": status, + "variants": self.load_variants_from_disk(request, item=item), + } + + def media_to_download(self, request, info, *, item=None): + canonical_path = self.file_path(request, info=info, item=item) + canonical_stat = cast( + dict[str, Any] | None, + self.store.stat_file(canonical_path, info), + ) + if not canonical_stat: + return None + last_modified = canonical_stat.get("last_modified") + if not last_modified: + return None + age_days = (time.time() - last_modified) / 60 / 60 / 24 + if age_days > self.expires: + return None + for _, _, path in self.variant_paths(request.url): + if not cast(dict[str, Any] | None, self.store.stat_file(path, info)): + return None + self.inc_stats("uptodate") + return self.make_file_result( + request, + checksum=canonical_stat.get("checksum"), + status="uptodate", + item=item, + ) + + def persist_variants(self, response, request, info, *, item=None) -> str | None: + canonical_path = self.file_path( + request, response=response, info=info, item=item + ) + canonical_checksum = None with tempfile.TemporaryDirectory() as tmp_dir: - settings = self.get_media_settings() tmp_file = f"{tmp_dir}/original" with open(tmp_file, "wb") as f: - f.write(buf.read()) - for setting in settings: - ext = setting["extension"] - name = setting["name"] - final_path = media_final_path(base_path, name, ext) - stat = self.store.stat_file(final_path, info) + f.write(response.body) + for _, setting, final_path in self.variant_paths(request.url): + stat = cast( + dict[str, Any] | None, + self.store.stat_file(final_path, info), + ) if stat: logger.info(f"Skipping, transcoded media exists at {final_path}") + if final_path == canonical_path: + canonical_checksum = stat.get("checksum") continue - converted_file = self.transcode(tmp_file, setting, tmp_dir) - if converted_file: - out_buf = read_asset(converted_file) - out_file = converted_file - else: - out_buf = buf - out_file = tmp_file + out_file = self.transcode(tmp_file, setting, tmp_dir) or tmp_file + out_buf = read_asset(out_file) probe_result = media.probe_media(out_file) meta = self.get_media_meta(probe_result) logger.info(f"{self.media_type} final {final_path} with {meta}") - yield final_path, out_buf, meta, setting["mimetype"] + checksum = buffer_checksum(out_buf) + self.store.persist_file( + final_path, + out_buf, + info, + meta=meta, + headers={"Content-Type": setting["mimetype"]}, + ) + if final_path == canonical_path: + canonical_checksum = checksum + return canonical_checksum + + def media_downloaded(self, response, request, info, *, item=None): + if response.status != 200: + raise FileException("download-error") + if not response.body: + raise FileException("empty-content") + status = "cached" if "cached" in response.flags else "downloaded" + self.inc_stats(status) + checksum = self.persist_variants(response, request, info, item=item) + return self.make_file_result( + request, + checksum=checksum, + status=status, + item=item, + ) class AudioPipeline(TranscodePipeline): - DEFAULT_FILES_URLS_FIELD = "audio_urls" DEFAULT_FILES_RESULT_FIELD = "audios" @@ -142,9 +273,6 @@ class AudioPipeline(TranscodePipeline): def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler): super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler) - def file_path(self, request, response=None, info=None, *, item=None): - return repub.utils.local_audio_path(request.url) - def get_media_settings(self) -> List[media.AudioSettings]: return self.settings["REPUBLISHER_AUDIO"] @@ -159,7 +287,6 @@ class AudioPipeline(TranscodePipeline): class VideoPipeline(TranscodePipeline): - DEFAULT_FILES_URLS_FIELD = "video_urls" DEFAULT_FILES_RESULT_FIELD = "videos" @@ -171,9 +298,6 @@ class VideoPipeline(TranscodePipeline): def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler): super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler) - def file_path(self, request, response=None, info=None, *, item=None): - return repub.utils.local_video_path(request.url) - def get_media_settings(self) -> List[media.VideoSettings]: return self.settings["REPUBLISHER_VIDEO"] diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index 409794e..80be20e 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -19,7 +19,13 @@ from repub.rss import ( plain_text_summary, sanitize_html, ) -from repub.utils import FileType, determine_file_type, local_file_path, local_image_path +from repub.utils import ( + FileType, + canonical_published_media_path, + determine_file_type, + local_file_path, + local_image_path, +) class BaseRssFeedSpider(Spider): @@ -51,8 +57,18 @@ class BaseRssFeedSpider(Spider): local_path = local_image_path(url) elif file_type == FileType.VIDEO: file_dir = self.settings["REPUBLISHER_VIDEO_DIR"] + local_path = canonical_published_media_path( + FileType.VIDEO, + url, + self.settings["REPUBLISHER_VIDEO"], + ) elif file_type == FileType.AUDIO: file_dir = self.settings["REPUBLISHER_AUDIO_DIR"] + local_path = canonical_published_media_path( + FileType.AUDIO, + url, + self.settings["REPUBLISHER_AUDIO"], + ) relative_path = f"{file_dir}/{local_path}" return self.absolute_feed_url(relative_path) diff --git a/repub/utils.py b/repub/utils.py index e747391..d10b920 100644 --- a/repub/utils.py +++ b/repub/utils.py @@ -2,7 +2,7 @@ import hashlib import mimetypes from enum import Enum from pathlib import Path -from typing import Optional +from typing import Any, Mapping, Optional, Sequence from scrapy.utils.python import to_bytes @@ -42,6 +42,30 @@ def local_audio_path(s: str) -> str: return local_file_path(s) +def variant_media_path(base_path: str, profile: Mapping[str, Any]) -> str: + return f"{base_path}-{profile['name']}.{profile['extension']}" + + +def published_media_path( + file_type: FileType, source_url: str, profile: Mapping[str, Any] +) -> str: + if file_type == FileType.AUDIO: + return variant_media_path(local_audio_path(source_url), profile) + if file_type == FileType.VIDEO: + return variant_media_path(local_video_path(source_url), profile) + raise ValueError(f"Unsupported file type for published media path: {file_type}") + + +def canonical_published_media_path( + file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]] +) -> str: + if not profiles: + raise ValueError(f"Missing transcode profiles for {file_type.value}") + # The first configured profile is the public URL contract. Reordering profiles + # changes published URLs for already-mirrored media. + return published_media_path(file_type, source_url, profiles[0]) + + def determine_file_type( url: str, medium: Optional[str] = None, mimetype: Optional[str] = None ): diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py index 6aacdb9..ad3446a 100644 --- a/tests/test_feed_validation.py +++ b/tests/test_feed_validation.py @@ -3,22 +3,34 @@ from __future__ import annotations import re from email.utils import parsedate_to_datetime from io import BytesIO +from typing import Callable import lxml.etree as etree from scrapy.http import TextResponse from scrapy.settings import Settings +from repub import settings as repub_settings from repub.exporters import RssExporter +from repub.items import ElementItem from repub.rss import nsmap from repub.spiders.rss_spider import RssFeedSpider -from repub.utils import local_audio_path, local_file_path, local_image_path +from repub.utils import local_audio_path, local_image_path, local_video_path RSS_DATE_PATTERN = re.compile( r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$" ) -def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]: +def _published_url(feed_url: str, path: str) -> str: + return f"{feed_url}/feeds/demo/{path}" + + +def _serialize_feed( + *, + feed_text: str, + feed_url: str, + prepare_item: Callable[[ElementItem], None] | None = None, +) -> tuple[str, etree._Element]: spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss") spider.settings = Settings( values={ @@ -26,6 +38,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, + "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, "REPUBLISHER_FEED_URL": feed_url, } ) @@ -39,6 +53,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme exporter = RssExporter(output) exporter.start_exporting() for item in list(spider._parse(response) or []): + if prepare_item is not None and isinstance(item, ElementItem): + prepare_item(item) exporter.export_item(item) exporter.finish_exporting() @@ -53,8 +69,88 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: source_video = "https://source.example/media/video.mp4" channel_image = "https://source.example/media/channel.png" item_image = "https://source.example/media/cover.jpg" + + def prepare_item(item: ElementItem) -> None: + audio_base_path = local_audio_path(source_audio) + video_base_path = local_video_path(source_video) + item.audios = [ + { + "url": source_audio, + "path": f"{audio_base_path}-vbr7.mp3", + "published_url": _published_url( + "https://mirror.example", + f"audio/{audio_base_path}-vbr7.mp3", + ), + "checksum": "audio-default", + "status": "downloaded", + "variants": [ + { + "url": _published_url( + "https://mirror.example", + f"audio/{audio_base_path}-vbr7.mp3", + ), + "path": f"{audio_base_path}-vbr7.mp3", + "type": "audio/mp3", + "medium": "audio", + "isDefault": "true", + "fileSize": "4567", + "bitrate": "96000", + "duration": "61.2", + "samplingrate": "44100", + "channels": "2", + }, + { + "url": _published_url( + "https://mirror.example", + f"audio/{audio_base_path}-vbr3.aac", + ), + "path": f"{audio_base_path}-vbr3.aac", + "type": "audio/aac", + "medium": "audio", + "isDefault": "false", + "fileSize": "3456", + "bitrate": "88000", + "duration": "61.2", + "samplingrate": "48000", + "channels": "2", + }, + ], + } + ] + item.videos = [ + { + "url": source_video, + "path": f"{video_base_path}-720.mp4", + "published_url": _published_url( + "https://mirror.example", + f"video/{video_base_path}-720.mp4", + ), + "checksum": "video-default", + "status": "downloaded", + "variants": [ + { + "url": _published_url( + "https://mirror.example", + f"video/{video_base_path}-720.mp4", + ), + "path": f"{video_base_path}-720.mp4", + "type": "video/mp4", + "medium": "video", + "isDefault": "true", + "fileSize": "9876", + "bitrate": "123456", + "duration": "60.0", + "width": "1280", + "height": "720", + "framerate": "30/1", + } + ], + } + ] + xml, root = _serialize_feed( feed_url="https://mirror.example", + prepare_item=prepare_item, feed_text=f""" None: enclosure = root.find("./channel/item/enclosure") assert enclosure is not None assert enclosure.attrib == { - "url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}", - "length": "123", - "type": "audio/mpeg", + "url": ( + f"https://mirror.example/feeds/demo/audio/" + f"{local_audio_path(source_audio)}-vbr7.mp3" + ), + "length": "4567", + "type": "audio/mp3", } assert len(enclosure) == 0 - media_content = root.find("./channel/item/media:content", namespaces=nsmap) - assert media_content is not None - assert media_content.attrib == { - "url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}", - "type": "video/mp4", - "medium": "video", - "expression": "full", - "duration": "60", - "width": "640", - "height": "360", - "lang": "en", - } - assert len(media_content) == 0 + assert root.find("./channel/item/media:content", namespaces=nsmap) is None + + media_groups = root.findall("./channel/item/media:group", namespaces=nsmap) + assert len(media_groups) == 2 + + audio_group, video_group = media_groups + audio_variants = audio_group.findall("media:content", namespaces=nsmap) + assert [variant.attrib for variant in audio_variants] == [ + { + "url": ( + f"https://mirror.example/feeds/demo/audio/" + f"{local_audio_path(source_audio)}-vbr7.mp3" + ), + "type": "audio/mp3", + "medium": "audio", + "isDefault": "true", + "bitrate": "96000", + "samplingrate": "44100", + "channels": "2", + "duration": "61.2", + "fileSize": "4567", + }, + { + "url": ( + f"https://mirror.example/feeds/demo/audio/" + f"{local_audio_path(source_audio)}-vbr3.aac" + ), + "type": "audio/aac", + "medium": "audio", + "isDefault": "false", + "bitrate": "88000", + "samplingrate": "48000", + "channels": "2", + "duration": "61.2", + "fileSize": "3456", + }, + ] + + video_variants = video_group.findall("media:content", namespaces=nsmap) + assert [variant.attrib for variant in video_variants] == [ + { + "url": ( + f"https://mirror.example/feeds/demo/video/" + f"{local_video_path(source_video)}-720.mp4" + ), + "type": "video/mp4", + "medium": "video", + "isDefault": "true", + "expression": "full", + "bitrate": "123456", + "framerate": "30/1", + "duration": "60.0", + "height": "720", + "width": "1280", + "lang": "en", + "fileSize": "9876", + } + ] itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap) assert itunes_image is not None diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py index 284a9fc..64e43e9 100644 --- a/tests/test_file_feeds.py +++ b/tests/test_file_feeds.py @@ -4,8 +4,9 @@ from scrapy.http import TextResponse from scrapy.settings import Settings from repub import entrypoint as entrypoint_module +from repub import settings as repub_settings from repub.spiders.rss_spider import RssFeedSpider -from repub.utils import FileType, local_audio_path, local_image_path +from repub.utils import FileType, local_audio_path, local_image_path, local_video_path def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None: @@ -50,6 +51,8 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, + "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, } ) @@ -62,7 +65,14 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: FileType.AUDIO, "https://example.com/media/podcast.mp3", ) - == f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}" + == f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}-vbr7.mp3" + ) + assert ( + spider.rewrite_file_url( + FileType.VIDEO, + "https://example.com/media/clip.mp4", + ) + == f"video/{local_video_path('https://example.com/media/clip.mp4')}-720.mp4" ) @@ -91,6 +101,8 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None: "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, + "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, } ) response = TextResponse( diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index e6904a6..15c6a80 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -1,8 +1,11 @@ import sys from pathlib import Path from types import SimpleNamespace +from typing import Any, cast import pytest +from scrapy.crawler import Crawler +from scrapy.http import Request, Response from repub import media from repub.config import ( @@ -11,7 +14,9 @@ from repub.config import ( build_base_settings, build_feed_settings, ) +from repub.items import ElementItem from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline +from repub.utils import local_audio_path, local_video_path def build_test_crawler(tmp_path: Path) -> SimpleNamespace: @@ -30,9 +35,18 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace: ) base_settings = build_base_settings(config) settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa") + settings.set("REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline") return SimpleNamespace(settings=settings, request_fingerprinter=object()) +def spider_info() -> Any: + return SimpleNamespace(spider=SimpleNamespace()) + + +def store_dir(pipeline: Any) -> Path: + return Path(cast(Any, pipeline.store).basedir) + + @pytest.mark.parametrize( ("pipeline_cls", "store_setting"), [ @@ -46,10 +60,10 @@ def test_pipeline_from_crawler_uses_configured_store( ) -> None: crawler = build_test_crawler(tmp_path) - pipeline = pipeline_cls.from_crawler(crawler) + pipeline = pipeline_cls.from_crawler(cast(Crawler, crawler)) assert pipeline.settings is crawler.settings - assert pipeline.store.basedir == crawler.settings[store_setting] + assert store_dir(pipeline) == Path(crawler.settings[store_setting]) def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None: @@ -188,3 +202,327 @@ def test_transcode_video_prints_ffmpeg_output_on_error( assert ("video-stderr", True) in printed assert ("video-stdout", False) in printed + + +def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants( + monkeypatch, tmp_path: Path +) -> None: + crawler = build_test_crawler(tmp_path) + pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler)) + monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) + persisted: list[tuple[str, str]] = [] + source_url = "https://example.com/podcast.mp3" + item = ElementItem( + feed_name="nasa", + el=None, + image_urls=[], + images=[], + file_urls=[], + files=[], + audio_urls=[source_url], + audios=[], + video_urls=[], + videos=[], + ) + + def fake_transcode( + input_file: str, settings: media.MediaSettings, tmp_dir: str + ) -> str: + output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}" + output_path.write_bytes(settings["name"].encode("utf-8")) + return str(output_path) + + def fake_probe_media(file_path: str): + if file_path.endswith("vbr7.mp3"): + return { + "format": { + "duration": "61.2", + "size": "4567", + "bit_rate": "96000", + "format_name": "mp3", + "format_long_name": "MP3", + }, + "streams": [ + { + "codec_type": "audio", + "codec_name": "mp3", + "bit_rate": "96000", + "duration_ts": "61200", + "sample_rate": "44100", + "channels": 2, + } + ], + } + return { + "format": { + "duration": "61.2", + "size": "3456", + "bit_rate": "88000", + "format_name": "aac", + "format_long_name": "AAC", + }, + "streams": [ + { + "codec_type": "audio", + "codec_name": "aac", + "bit_rate": "88000", + "duration_ts": "61200", + "sample_rate": "48000", + "channels": 2, + } + ], + } + + monkeypatch.setattr(pipeline, "transcode", fake_transcode) + monkeypatch.setattr(media, "probe_media", fake_probe_media) + + def fake_persist_file(path, buf, info, meta=None, headers=None): + del info, meta + assert headers is not None + target = store_dir(pipeline) / path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(buf.read()) + persisted.append((path, headers["Content-Type"])) + + monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file) + + result = pipeline.media_downloaded( + Response(url=source_url, body=b"source-bytes", status=200), + Request(source_url), + spider_info(), + item=item, + ) + + audio_base_path = local_audio_path(source_url) + assert isinstance(result, dict) + assert isinstance(result["checksum"], str) + assert result == { + "url": source_url, + "path": f"{audio_base_path}-vbr7.mp3", + "published_url": ( + f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3" + ), + "checksum": result["checksum"], + "status": "downloaded", + "variants": [ + { + "url": ( + f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3" + ), + "path": f"{audio_base_path}-vbr7.mp3", + "type": "audio/mp3", + "medium": "audio", + "isDefault": "true", + "fileSize": "4567", + "bitrate": 96000, + "duration": "61.2", + "samplingrate": 44100, + "channels": 2, + }, + { + "url": ( + f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr3.aac" + ), + "path": f"{audio_base_path}-vbr3.aac", + "type": "audio/aac", + "medium": "audio", + "isDefault": "false", + "fileSize": "3456", + "bitrate": 88000, + "duration": "61.2", + "samplingrate": 48000, + "channels": 2, + }, + ], + } + assert persisted == [ + (f"{audio_base_path}-vbr7.mp3", "audio/mp3"), + (f"{audio_base_path}-vbr3.aac", "audio/aac"), + ] + + completed_item = pipeline.item_completed( + [(True, result)], + item, + spider_info(), + ) + assert completed_item.audios == [result] + + +def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants( + monkeypatch, tmp_path: Path +) -> None: + crawler = build_test_crawler(tmp_path) + pipeline = VideoPipeline.from_crawler(cast(Crawler, crawler)) + monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) + persisted: list[tuple[str, str]] = [] + source_url = "https://example.com/video.mp4" + item = ElementItem( + feed_name="nasa", + el=None, + image_urls=[], + images=[], + file_urls=[], + files=[], + audio_urls=[], + audios=[], + video_urls=[source_url], + videos=[], + ) + + def fake_transcode( + input_file: str, settings: media.MediaSettings, tmp_dir: str + ) -> str: + output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}" + output_path.write_bytes(settings["name"].encode("utf-8")) + return str(output_path) + + monkeypatch.setattr(pipeline, "transcode", fake_transcode) + monkeypatch.setattr( + media, + "probe_media", + lambda _: { + "format": { + "duration": "60.0", + "size": "9876", + "bit_rate": "123456", + "format_name": "mp4", + "format_long_name": "MP4", + }, + "streams": [ + { + "codec_type": "video", + "codec_name": "h264", + "bit_rate": "123456", + "duration_ts": "60000", + "width": 1280, + "height": 720, + "avg_frame_rate": "30/1", + }, + { + "codec_type": "audio", + "codec_name": "mp3", + "bit_rate": "96000", + "duration_ts": "60000", + }, + ], + }, + ) + + def fake_persist_file(path, buf, info, meta=None, headers=None): + del info, meta + assert headers is not None + target = store_dir(pipeline) / path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(buf.read()) + persisted.append((path, headers["Content-Type"])) + + monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file) + + result = pipeline.media_downloaded( + Response(url=source_url, body=b"video-bytes", status=200), + Request(source_url), + spider_info(), + item=item, + ) + + video_base_path = local_video_path(source_url) + assert isinstance(result, dict) + assert isinstance(result["checksum"], str) + assert result == { + "url": source_url, + "path": f"{video_base_path}-720.mp4", + "published_url": ( + f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4" + ), + "checksum": result["checksum"], + "status": "downloaded", + "variants": [ + { + "url": ( + f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4" + ), + "path": f"{video_base_path}-720.mp4", + "type": "video/mp4", + "medium": "video", + "isDefault": "true", + "fileSize": "9876", + "bitrate": 123456, + "duration": "60.0", + "width": 1280, + "height": 720, + "framerate": "30/1", + } + ], + } + assert persisted == [(f"{video_base_path}-720.mp4", "video/mp4")] + + +def test_audio_pipeline_media_to_download_checks_canonical_path( + monkeypatch, tmp_path: Path +) -> None: + crawler = build_test_crawler(tmp_path) + pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler)) + monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) + source_url = "https://example.com/podcast.mp3" + audio_base_path = local_audio_path(source_url) + canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7.mp3" + secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3.aac" + canonical_path.parent.mkdir(parents=True, exist_ok=True) + canonical_path.write_bytes(b"default") + secondary_path.write_bytes(b"alt") + stat_paths: list[str] = [] + original_stat_file = pipeline.store.stat_file + item = ElementItem( + feed_name="nasa", + el=None, + image_urls=[], + images=[], + file_urls=[], + files=[], + audio_urls=[source_url], + audios=[], + video_urls=[], + videos=[], + ) + + def wrapped_stat_file(path, info): + stat_paths.append(path) + return original_stat_file(path, info) + + monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file) + monkeypatch.setattr( + media, + "probe_media", + lambda file_path: { + "format": { + "duration": "61.2", + "size": "4567" if file_path.endswith("vbr7.mp3") else "3456", + "bit_rate": "96000" if file_path.endswith("vbr7.mp3") else "88000", + "format_name": "mp3" if file_path.endswith("vbr7.mp3") else "aac", + "format_long_name": "Audio", + }, + "streams": [ + { + "codec_type": "audio", + "codec_name": "mp3" if file_path.endswith("vbr7.mp3") else "aac", + "bit_rate": "96000" if file_path.endswith("vbr7.mp3") else "88000", + "duration_ts": "61200", + "sample_rate": ( + "44100" if file_path.endswith("vbr7.mp3") else "48000" + ), + "channels": 2, + } + ], + }, + ) + + result = pipeline.media_to_download( + Request(source_url), + spider_info(), + item=item, + ) + assert result is not None + assert result["path"] == f"{audio_base_path}-vbr7.mp3" + assert result["status"] == "uptodate" + assert f"{audio_base_path}.mp3" not in stat_paths + assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3"