Fix published paths for transcoded media

This commit is contained in:
Abel Luck 2026-03-31 14:14:46 +02:00
parent 3f33994cdc
commit 89d462e280
9 changed files with 956 additions and 114 deletions

View file

@@ -3,22 +3,34 @@ from __future__ import annotations
import re
from email.utils import parsedate_to_datetime
from io import BytesIO
from typing import Callable
import lxml.etree as etree
from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub import settings as repub_settings
from repub.exporters import RssExporter
from repub.items import ElementItem
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import local_audio_path, local_file_path, local_image_path
from repub.utils import local_audio_path, local_image_path, local_video_path
RSS_DATE_PATTERN = re.compile(
r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)
def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
def _published_url(feed_url: str, path: str) -> str:
return f"{feed_url}/feeds/demo/{path}"
def _serialize_feed(
*,
feed_text: str,
feed_url: str,
prepare_item: Callable[[ElementItem], None] | None = None,
) -> tuple[str, etree._Element]:
spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
spider.settings = Settings(
values={
@@ -26,6 +38,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
"REPUBLISHER_FEED_URL": feed_url,
}
)
@@ -39,6 +53,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
exporter = RssExporter(output)
exporter.start_exporting()
for item in list(spider._parse(response) or []):
if prepare_item is not None and isinstance(item, ElementItem):
prepare_item(item)
exporter.export_item(item)
exporter.finish_exporting()
@@ -53,8 +69,88 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
source_video = "https://source.example/media/video.mp4"
channel_image = "https://source.example/media/channel.png"
item_image = "https://source.example/media/cover.jpg"
def prepare_item(item: ElementItem) -> None:
audio_base_path = local_audio_path(source_audio)
video_base_path = local_video_path(source_video)
item.audios = [
{
"url": source_audio,
"path": f"{audio_base_path}-vbr7.mp3",
"published_url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}-vbr7.mp3",
),
"checksum": "audio-default",
"status": "downloaded",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}-vbr7.mp3",
),
"path": f"{audio_base_path}-vbr7.mp3",
"type": "audio/mp3",
"medium": "audio",
"isDefault": "true",
"fileSize": "4567",
"bitrate": "96000",
"duration": "61.2",
"samplingrate": "44100",
"channels": "2",
},
{
"url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}-vbr3.aac",
),
"path": f"{audio_base_path}-vbr3.aac",
"type": "audio/aac",
"medium": "audio",
"isDefault": "false",
"fileSize": "3456",
"bitrate": "88000",
"duration": "61.2",
"samplingrate": "48000",
"channels": "2",
},
],
}
]
item.videos = [
{
"url": source_video,
"path": f"{video_base_path}-720.mp4",
"published_url": _published_url(
"https://mirror.example",
f"video/{video_base_path}-720.mp4",
),
"checksum": "video-default",
"status": "downloaded",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"video/{video_base_path}-720.mp4",
),
"path": f"{video_base_path}-720.mp4",
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
"fileSize": "9876",
"bitrate": "123456",
"duration": "60.0",
"width": "1280",
"height": "720",
"framerate": "30/1",
}
],
}
]
xml, root = _serialize_feed(
feed_url="https://mirror.example",
prepare_item=prepare_item,
feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
@@ -130,25 +226,73 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
enclosure = root.find("./channel/item/enclosure")
assert enclosure is not None
assert enclosure.attrib == {
"url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
"length": "123",
"type": "audio/mpeg",
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}-vbr7.mp3"
),
"length": "4567",
"type": "audio/mp3",
}
assert len(enclosure) == 0
media_content = root.find("./channel/item/media:content", namespaces=nsmap)
assert media_content is not None
assert media_content.attrib == {
"url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
"type": "video/mp4",
"medium": "video",
"expression": "full",
"duration": "60",
"width": "640",
"height": "360",
"lang": "en",
}
assert len(media_content) == 0
assert root.find("./channel/item/media:content", namespaces=nsmap) is None
media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
assert len(media_groups) == 2
audio_group, video_group = media_groups
audio_variants = audio_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in audio_variants] == [
{
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}-vbr7.mp3"
),
"type": "audio/mp3",
"medium": "audio",
"isDefault": "true",
"bitrate": "96000",
"samplingrate": "44100",
"channels": "2",
"duration": "61.2",
"fileSize": "4567",
},
{
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}-vbr3.aac"
),
"type": "audio/aac",
"medium": "audio",
"isDefault": "false",
"bitrate": "88000",
"samplingrate": "48000",
"channels": "2",
"duration": "61.2",
"fileSize": "3456",
},
]
video_variants = video_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in video_variants] == [
{
"url": (
f"https://mirror.example/feeds/demo/video/"
f"{local_video_path(source_video)}-720.mp4"
),
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
"expression": "full",
"bitrate": "123456",
"framerate": "30/1",
"duration": "60.0",
"height": "720",
"width": "1280",
"lang": "en",
"fileSize": "9876",
}
]
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
assert itunes_image is not None

View file

@@ -4,8 +4,9 @@ from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub import entrypoint as entrypoint_module
from repub import settings as repub_settings
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import FileType, local_audio_path, local_image_path
from repub.utils import FileType, local_audio_path, local_image_path, local_video_path
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
@@ -50,6 +51,8 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}
)
@@ -62,7 +65,14 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
FileType.AUDIO,
"https://example.com/media/podcast.mp3",
)
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}-vbr7.mp3"
)
assert (
spider.rewrite_file_url(
FileType.VIDEO,
"https://example.com/media/clip.mp4",
)
== f"video/{local_video_path('https://example.com/media/clip.mp4')}-720.mp4"
)
@@ -91,6 +101,8 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}
)
response = TextResponse(

View file

@@ -1,8 +1,11 @@
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
import pytest
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from repub import media
from repub.config import (
@@ -11,7 +14,9 @@ from repub.config import (
build_base_settings,
build_feed_settings,
)
from repub.items import ElementItem
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
from repub.utils import local_audio_path, local_video_path
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
@@ -30,9 +35,18 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
)
base_settings = build_base_settings(config)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa")
settings.set("REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline")
return SimpleNamespace(settings=settings, request_fingerprinter=object())
def spider_info() -> Any:
    """Build a minimal stand-in for Scrapy's media-pipeline SpiderInfo object."""
    dummy_spider = SimpleNamespace()
    return SimpleNamespace(spider=dummy_spider)
def store_dir(pipeline: Any) -> Path:
    """Return the base directory of the pipeline's file store as a Path."""
    store = cast(Any, pipeline.store)
    return Path(store.basedir)
@pytest.mark.parametrize(
("pipeline_cls", "store_setting"),
[
@@ -46,10 +60,10 @@ def test_pipeline_from_crawler_uses_configured_store(
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = pipeline_cls.from_crawler(crawler)
pipeline = pipeline_cls.from_crawler(cast(Crawler, crawler))
assert pipeline.settings is crawler.settings
assert pipeline.store.basedir == crawler.settings[store_setting]
assert store_dir(pipeline) == Path(crawler.settings[store_setting])
def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
@@ -188,3 +202,327 @@ def test_transcode_video_prints_ffmpeg_output_on_error(
assert ("video-stderr", True) in printed
assert ("video-stdout", False) in printed
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """AudioPipeline.media_downloaded should persist every transcoded rendition
    and return the canonical (default) variant's file info plus the full
    ``variants`` list; item_completed then attaches that result to the item.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)

    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/podcast.mp3"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Stand-in for ffmpeg: emit a tiny placeholder file for the rendition.
        output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        output_path.write_bytes(settings["name"].encode("utf-8"))
        return str(output_path)

    def fake_probe_media(file_path: str):
        # Two canned ffprobe results: the default mp3 rendition vs. the aac one.
        if file_path.endswith("vbr7.mp3"):
            codec, size, bit_rate, sample_rate, long_name = (
                "mp3",
                "4567",
                "96000",
                "44100",
                "MP3",
            )
        else:
            codec, size, bit_rate, sample_rate, long_name = (
                "aac",
                "3456",
                "88000",
                "48000",
                "AAC",
            )
        return {
            "format": {
                "duration": "61.2",
                "size": size,
                "bit_rate": bit_rate,
                "format_name": codec,
                "format_long_name": long_name,
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": codec,
                    "bit_rate": bit_rate,
                    "duration_ts": "61200",
                    "sample_rate": sample_rate,
                    "channels": 2,
                }
            ],
        }

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        # Capture what the pipeline persists (path + Content-Type) and write
        # the bytes under the configured store directory.
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"source-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )

    audio_base_path = local_audio_path(source_url)
    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    assert result == {
        "url": source_url,
        "path": f"{audio_base_path}-vbr7.mp3",
        "published_url": (
            f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3"
        ),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            {
                "url": (
                    f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3"
                ),
                "path": f"{audio_base_path}-vbr7.mp3",
                "type": "audio/mp3",
                "medium": "audio",
                "isDefault": "true",
                "fileSize": "4567",
                "bitrate": 96000,
                "duration": "61.2",
                "samplingrate": 44100,
                "channels": 2,
            },
            {
                "url": (
                    f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr3.aac"
                ),
                "path": f"{audio_base_path}-vbr3.aac",
                "type": "audio/aac",
                "medium": "audio",
                "isDefault": "false",
                "fileSize": "3456",
                "bitrate": 88000,
                "duration": "61.2",
                "samplingrate": 48000,
                "channels": 2,
            },
        ],
    }
    # Both renditions must have been written, default first.
    assert persisted == [
        (f"{audio_base_path}-vbr7.mp3", "audio/mp3"),
        (f"{audio_base_path}-vbr3.aac", "audio/aac"),
    ]

    completed_item = pipeline.item_completed(
        [(True, result)],
        item,
        spider_info(),
    )
    assert completed_item.audios == [result]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """VideoPipeline.media_downloaded should persist the transcoded rendition
    and return the canonical variant's file info with its ``variants`` list.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = VideoPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)

    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/video.mp4"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[source_url],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Stand-in for ffmpeg: emit a tiny placeholder file for the rendition.
        output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        output_path.write_bytes(settings["name"].encode("utf-8"))
        return str(output_path)

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)

    def fake_probe_media(_path):
        # Canned ffprobe result: one h264 video stream plus an audio stream.
        return {
            "format": {
                "duration": "60.0",
                "size": "9876",
                "bit_rate": "123456",
                "format_name": "mp4",
                "format_long_name": "MP4",
            },
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "h264",
                    "bit_rate": "123456",
                    "duration_ts": "60000",
                    "width": 1280,
                    "height": 720,
                    "avg_frame_rate": "30/1",
                },
                {
                    "codec_type": "audio",
                    "codec_name": "mp3",
                    "bit_rate": "96000",
                    "duration_ts": "60000",
                },
            ],
        }

    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        # Capture what the pipeline persists (path + Content-Type) and write
        # the bytes under the configured store directory.
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"video-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )

    video_base_path = local_video_path(source_url)
    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    assert result == {
        "url": source_url,
        "path": f"{video_base_path}-720.mp4",
        "published_url": (
            f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
        ),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            {
                "url": (
                    f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
                ),
                "path": f"{video_base_path}-720.mp4",
                "type": "video/mp4",
                "medium": "video",
                "isDefault": "true",
                "fileSize": "9876",
                "bitrate": 123456,
                "duration": "60.0",
                "width": 1280,
                "height": 720,
                "framerate": "30/1",
            }
        ],
    }
    assert persisted == [(f"{video_base_path}-720.mp4", "video/mp4")]
def test_audio_pipeline_media_to_download_checks_canonical_path(
    monkeypatch, tmp_path: Path
) -> None:
    """media_to_download must stat the canonical ``-vbr7.mp3`` path first —
    never the legacy ``.mp3`` path — and report an existing file as uptodate.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)

    source_url = "https://example.com/podcast.mp3"
    audio_base_path = local_audio_path(source_url)
    # Pre-populate the store with both renditions so the download is skipped.
    canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7.mp3"
    secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3.aac"
    canonical_path.parent.mkdir(parents=True, exist_ok=True)
    canonical_path.write_bytes(b"default")
    secondary_path.write_bytes(b"alt")

    stat_paths: list[str] = []
    original_stat_file = pipeline.store.stat_file
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def wrapped_stat_file(path, info):
        # Record every path the pipeline stats so ordering can be asserted.
        stat_paths.append(path)
        return original_stat_file(path, info)

    monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file)

    def fake_probe_media(file_path):
        # Canned ffprobe result, keyed on whether this is the default rendition.
        is_default = file_path.endswith("vbr7.mp3")
        return {
            "format": {
                "duration": "61.2",
                "size": "4567" if is_default else "3456",
                "bit_rate": "96000" if is_default else "88000",
                "format_name": "mp3" if is_default else "aac",
                "format_long_name": "Audio",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": "mp3" if is_default else "aac",
                    "bit_rate": "96000" if is_default else "88000",
                    "duration_ts": "61200",
                    "sample_rate": "44100" if is_default else "48000",
                    "channels": 2,
                }
            ],
        }

    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    result = pipeline.media_to_download(
        Request(source_url),
        spider_info(),
        item=item,
    )
    assert result is not None
    assert result["path"] == f"{audio_base_path}-vbr7.mp3"
    assert result["status"] == "uptodate"
    # The legacy un-suffixed path must never be consulted.
    assert f"{audio_base_path}.mp3" not in stat_paths
    assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3"