Fix published paths for transcoded media

This commit is contained in:
Abel Luck 2026-03-31 14:14:46 +02:00
parent 3f33994cdc
commit 89d462e280
9 changed files with 956 additions and 114 deletions

View file

@ -1,8 +1,11 @@
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
import pytest
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from repub import media
from repub.config import (
@ -11,7 +14,9 @@ from repub.config import (
build_base_settings,
build_feed_settings,
)
from repub.items import ElementItem
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
from repub.utils import local_audio_path, local_video_path
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
@ -30,9 +35,18 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
)
base_settings = build_base_settings(config)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa")
settings.set("REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline")
return SimpleNamespace(settings=settings, request_fingerprinter=object())
def spider_info() -> Any:
    """Build a minimal stand-in for the media pipeline's SpiderInfo argument."""
    dummy_spider = SimpleNamespace()
    return SimpleNamespace(spider=dummy_spider)
def store_dir(pipeline: Any) -> Path:
    """Return the pipeline's file-store base directory as a ``Path``."""
    store = cast(Any, pipeline.store)
    return Path(store.basedir)
@pytest.mark.parametrize(
("pipeline_cls", "store_setting"),
[
@ -46,10 +60,10 @@ def test_pipeline_from_crawler_uses_configured_store(
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = pipeline_cls.from_crawler(crawler)
pipeline = pipeline_cls.from_crawler(cast(Crawler, crawler))
assert pipeline.settings is crawler.settings
assert pipeline.store.basedir == crawler.settings[store_setting]
assert store_dir(pipeline) == Path(crawler.settings[store_setting])
def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
@ -188,3 +202,327 @@ def test_transcode_video_prints_ffmpeg_output_on_error(
assert ("video-stderr", True) in printed
assert ("video-stdout", False) in printed
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """media_downloaded should persist both transcoded audio variants and return
    the canonical (vbr7 mp3) variant as the file info's path/published_url,
    with per-variant metadata derived from the probed media properties."""
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    # Stats bookkeeping is irrelevant to this test; neutralize it.
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    # Records (relative path, Content-Type) for every file the pipeline persists.
    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/podcast.mp3"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # ffmpeg stand-in: write a tiny file named after the variant settings.
        output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        output_path.write_bytes(settings["name"].encode("utf-8"))
        return str(output_path)

    def fake_probe_media(file_path: str):
        # ffprobe stand-in returning distinct metadata per variant so the two
        # entries in result["variants"] can be told apart in the assertions.
        if file_path.endswith("vbr7.mp3"):
            return {
                "format": {
                    "duration": "61.2",
                    "size": "4567",
                    "bit_rate": "96000",
                    "format_name": "mp3",
                    "format_long_name": "MP3",
                },
                "streams": [
                    {
                        "codec_type": "audio",
                        "codec_name": "mp3",
                        "bit_rate": "96000",
                        "duration_ts": "61200",
                        "sample_rate": "44100",
                        "channels": 2,
                    }
                ],
            }
        return {
            "format": {
                "duration": "61.2",
                "size": "3456",
                "bit_rate": "88000",
                "format_name": "aac",
                "format_long_name": "AAC",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": "aac",
                    "bit_rate": "88000",
                    "duration_ts": "61200",
                    "sample_rate": "48000",
                    "channels": 2,
                }
            ],
        }

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        # Capture what the pipeline stores instead of hitting a real backend.
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)
    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"source-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )
    audio_base_path = local_audio_path(source_url)
    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    # The published URLs must live under the feed's audio prefix on the
    # configured REPUBLISHER_FEED_URL host.
    assert result == {
        "url": source_url,
        "path": f"{audio_base_path}-vbr7.mp3",
        "published_url": (
            f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3"
        ),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            {
                "url": (
                    f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3"
                ),
                "path": f"{audio_base_path}-vbr7.mp3",
                "type": "audio/mp3",
                "medium": "audio",
                "isDefault": "true",
                "fileSize": "4567",
                "bitrate": 96000,
                "duration": "61.2",
                "samplingrate": 44100,
                "channels": 2,
            },
            {
                "url": (
                    f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr3.aac"
                ),
                "path": f"{audio_base_path}-vbr3.aac",
                "type": "audio/aac",
                "medium": "audio",
                "isDefault": "false",
                "fileSize": "3456",
                "bitrate": 88000,
                "duration": "61.2",
                "samplingrate": 48000,
                "channels": 2,
            },
        ],
    }
    # Both variants were persisted with their respective content types.
    assert persisted == [
        (f"{audio_base_path}-vbr7.mp3", "audio/mp3"),
        (f"{audio_base_path}-vbr3.aac", "audio/aac"),
    ]
    # item_completed should attach the successful download result to the item.
    completed_item = pipeline.item_completed(
        [(True, result)],
        item,
        spider_info(),
    )
    assert completed_item.audios == [result]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """media_downloaded for video should persist the transcoded 720p variant and
    report its path plus a published URL under the feed's video prefix."""
    crawler = build_test_crawler(tmp_path)
    pipeline = VideoPipeline.from_crawler(cast(Crawler, crawler))
    # Stats bookkeeping is irrelevant to this test; neutralize it.
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    # Records (relative path, Content-Type) for every file the pipeline persists.
    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/video.mp4"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[source_url],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # ffmpeg stand-in: write a tiny file named after the variant settings.
        output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        output_path.write_bytes(settings["name"].encode("utf-8"))
        return str(output_path)

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    # ffprobe stand-in: one h264 1280x720 video stream plus one audio stream.
    monkeypatch.setattr(
        media,
        "probe_media",
        lambda _: {
            "format": {
                "duration": "60.0",
                "size": "9876",
                "bit_rate": "123456",
                "format_name": "mp4",
                "format_long_name": "MP4",
            },
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "h264",
                    "bit_rate": "123456",
                    "duration_ts": "60000",
                    "width": 1280,
                    "height": 720,
                    "avg_frame_rate": "30/1",
                },
                {
                    "codec_type": "audio",
                    "codec_name": "mp3",
                    "bit_rate": "96000",
                    "duration_ts": "60000",
                },
            ],
        },
    )

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        # Capture what the pipeline stores instead of hitting a real backend.
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)
    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"video-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )
    video_base_path = local_video_path(source_url)
    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    # The published URL must live under the feed's video prefix on the
    # configured REPUBLISHER_FEED_URL host.
    assert result == {
        "url": source_url,
        "path": f"{video_base_path}-720.mp4",
        "published_url": (
            f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
        ),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            {
                "url": (
                    f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
                ),
                "path": f"{video_base_path}-720.mp4",
                "type": "video/mp4",
                "medium": "video",
                "isDefault": "true",
                "fileSize": "9876",
                "bitrate": 123456,
                "duration": "60.0",
                "width": 1280,
                "height": 720,
                "framerate": "30/1",
            }
        ],
    }
    # Only the single 720p variant is persisted for video.
    assert persisted == [(f"{video_base_path}-720.mp4", "video/mp4")]
def test_audio_pipeline_media_to_download_checks_canonical_path(
    monkeypatch, tmp_path: Path
) -> None:
    """When both variant files already exist in the store, media_to_download
    should stat the canonical suffixed path (``-vbr7.mp3``) — not the bare
    base path — and report the entry as up to date."""
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    # Stats bookkeeping is irrelevant to this test; neutralize it.
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    source_url = "https://example.com/podcast.mp3"
    audio_base_path = local_audio_path(source_url)
    # Pre-seed the store with both transcoded variants on disk.
    canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7.mp3"
    secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3.aac"
    canonical_path.parent.mkdir(parents=True, exist_ok=True)
    canonical_path.write_bytes(b"default")
    secondary_path.write_bytes(b"alt")
    # Records every relative path the pipeline stats against the store.
    stat_paths: list[str] = []
    original_stat_file = pipeline.store.stat_file
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def wrapped_stat_file(path, info):
        # Spy on stat_file while delegating to the real store implementation.
        stat_paths.append(path)
        return original_stat_file(path, info)

    monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file)
    # ffprobe stand-in keyed off the variant suffix of the probed file path.
    monkeypatch.setattr(
        media,
        "probe_media",
        lambda file_path: {
            "format": {
                "duration": "61.2",
                "size": "4567" if file_path.endswith("vbr7.mp3") else "3456",
                "bit_rate": "96000" if file_path.endswith("vbr7.mp3") else "88000",
                "format_name": "mp3" if file_path.endswith("vbr7.mp3") else "aac",
                "format_long_name": "Audio",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": "mp3" if file_path.endswith("vbr7.mp3") else "aac",
                    "bit_rate": "96000" if file_path.endswith("vbr7.mp3") else "88000",
                    "duration_ts": "61200",
                    "sample_rate": (
                        "44100" if file_path.endswith("vbr7.mp3") else "48000"
                    ),
                    "channels": 2,
                }
            ],
        },
    )
    result = pipeline.media_to_download(
        Request(source_url),
        spider_info(),
        item=item,
    )
    assert result is not None
    assert result["path"] == f"{audio_base_path}-vbr7.mp3"
    assert result["status"] == "uptodate"
    # Regression guard: the un-suffixed base path must never be statted.
    assert f"{audio_base_path}.mp3" not in stat_paths
    assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3"