Include original media in media groups

2026-03-31 14:33:49 +02:00 · 2026-03-31 14:33:49 +02:00 · 954608c5f9
commit 954608c5f9
parent 89d462e280
3 changed files with 212 additions and 14 deletions
--- a/repub/pipelines.py
+++ b/repub/pipelines.py
@ -1,5 +1,6 @@
 import hashlib
 import logging
 import mimetypes
 import tempfile
 import time
 from io import BytesIO
@ -116,6 +117,26 @@ class TranscodePipeline(BaseFilesPipeline):
            for index, setting in enumerate(settings)
        ]
    def original_path(self, source_url: str) -> str:
        """Return the store-relative path for the untranscoded original.

        The path helper is chosen from ``self.media_type``: audio sources go
        through ``repub.utils.local_audio_path`` and video sources through
        ``repub.utils.local_video_path``.

        Raises:
            ValueError: If ``self.media_type`` is neither AUDIO nor VIDEO.
        """
        kind = self.media_type
        if kind == repub.utils.FileType.AUDIO:
            resolve = repub.utils.local_audio_path
        elif kind == repub.utils.FileType.VIDEO:
            resolve = repub.utils.local_video_path
        else:
            raise ValueError(f"Unsupported media type: {kind}")
        return resolve(source_url)
    def original_mimetype(self, source_url: str, response=None) -> str:
        """Return the MIME type to use for the untranscoded original file.

        Resolution order:

        1. The ``Content-Type`` header of ``response``, with any parameters
           (``; charset=...``) removed.
        2. A guess from ``source_url`` via :func:`mimetypes.guess_type`.
        3. A per-media-type default: ``audio/mpeg`` or ``video/mp4``.

        Args:
            source_url: URL the original media was fetched from.
            response: Optional response object whose ``headers.get`` is
                queried with ``b"Content-Type"``. When omitted, only the URL
                guess and the default are used.

        Returns:
            The MIME type string without parameters; never empty.

        Raises:
            KeyError: If neither the header nor the URL yields a type and
                ``self.media_type`` is neither AUDIO nor VIDEO.
        """
        if response is not None:
            content_type = response.headers.get(b"Content-Type")
            if content_type:
                if isinstance(content_type, bytes):
                    # A malformed header must not abort the download with a
                    # UnicodeDecodeError; undecodable bytes are replaced.
                    content_type = content_type.decode("utf-8", errors="replace")
                header_mimetype = content_type.split(";", 1)[0].strip()
                # A header such as "; charset=utf-8" carries no media type;
                # fall through to the URL guess rather than return "".
                if header_mimetype:
                    return header_mimetype
        mimetype = mimetypes.guess_type(source_url)[0]
        if mimetype:
            return mimetype
        return {
            repub.utils.FileType.AUDIO: "audio/mpeg",
            repub.utils.FileType.VIDEO: "video/mp4",
        }[self.media_type]
    def published_url(self, path: str, item=None) -> str:
        relative_path = f"{self.media_dir()}/{path}"
        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
@ -130,7 +151,7 @@ class TranscodePipeline(BaseFilesPipeline):
        self,
        *,
        path: str,
-        setting: media.MediaSettings,
+        mimetype: str,
        probe_result: dict[str, Any],
        is_default: bool,
        item=None,
@ -138,7 +159,7 @@ class TranscodePipeline(BaseFilesPipeline):
        variant: MediaVariant = {
            "url": self.published_url(path, item),
            "path": path,
-            "type": setting["mimetype"],
+            "type": mimetype,
            "medium": self.media_type.value,
            "isDefault": "true" if is_default else "false",
        }
@ -158,12 +179,24 @@ class TranscodePipeline(BaseFilesPipeline):
            variants.append(
                self.media_variant(
                    path=path,
-                    setting=setting,
+                    mimetype=setting["mimetype"],
                    probe_result=probe_result,
                    is_default=is_default,
                    item=item,
                )
            )
        original_path = self.original_path(request.url)
        original_file = self.local_store_path(original_path)
        if original_file.exists():
            variants.append(
                self.media_variant(
                    path=original_path,
                    mimetype=self.original_mimetype(request.url),
                    probe_result=media.probe_media(str(original_file)),
                    is_default=False,
                    item=item,
                )
            )
        return variants
    def make_file_result(
@ -201,6 +234,11 @@ class TranscodePipeline(BaseFilesPipeline):
        for _, _, path in self.variant_paths(request.url):
            if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
                return None
        if not cast(
            dict[str, Any] | None,
            self.store.stat_file(self.original_path(request.url), info),
        ):
            return None
        self.inc_stats("uptodate")
        return self.make_file_result(
            request,
@ -218,6 +256,23 @@ class TranscodePipeline(BaseFilesPipeline):
            tmp_file = f"{tmp_dir}/original"
            with open(tmp_file, "wb") as f:
                f.write(response.body)
            original_path = self.original_path(request.url)
            if not cast(
                dict[str, Any] | None,
                self.store.stat_file(original_path, info),
            ):
                original_buf = read_asset(tmp_file)
                self.store.persist_file(
                    original_path,
                    original_buf,
                    info,
                    meta=self.get_media_meta(media.probe_media(tmp_file)),
                    headers={
                        "Content-Type": self.original_mimetype(
                            request.url, response=response
                        )
                    },
                )
            for _, setting, final_path in self.variant_paths(request.url):
                stat = cast(
                    dict[str, Any] | None,
--- a/tests/test_feed_validation.py
+++ b/tests/test_feed_validation.py
@ -114,6 +114,21 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
                        "samplingrate": "48000",
                        "channels": "2",
                    },
                    {
                        "url": _published_url(
                            "https://mirror.example",
                            f"audio/{audio_base_path}",
                        ),
                        "path": audio_base_path,
                        "type": "audio/mpeg",
                        "medium": "audio",
                        "isDefault": "false",
                        "fileSize": "5678",
                        "bitrate": "128000",
                        "duration": "61.2",
                        "samplingrate": "44100",
                        "channels": "2",
                    },
                ],
            }
        ]
@ -143,7 +158,23 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
                        "width": "1280",
                        "height": "720",
                        "framerate": "30/1",
-                    }
+                    },
                    {
                        "url": _published_url(
                            "https://mirror.example",
                            f"video/{video_base_path}",
                        ),
                        "path": video_base_path,
                        "type": "video/mp4",
                        "medium": "video",
                        "isDefault": "false",
                        "fileSize": "12345",
                        "bitrate": "456789",
                        "duration": "60.0",
                        "width": "640",
                        "height": "360",
                        "framerate": "24/1",
                    },
                ],
            }
        ]
@ -271,6 +302,20 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
            "duration": "61.2",
            "fileSize": "3456",
        },
        {
            "url": (
                f"https://mirror.example/feeds/demo/audio/"
                f"{local_audio_path(source_audio)}"
            ),
            "type": "audio/mpeg",
            "medium": "audio",
            "isDefault": "false",
            "bitrate": "128000",
            "samplingrate": "44100",
            "channels": "2",
            "duration": "61.2",
            "fileSize": "5678",
        },
    ]
    video_variants = video_group.findall("media:content", namespaces=nsmap)
@ -291,7 +336,24 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
            "width": "1280",
            "lang": "en",
            "fileSize": "9876",
-        }
+        },
        {
            "url": (
                f"https://mirror.example/feeds/demo/video/"
                f"{local_video_path(source_video)}"
            ),
            "type": "video/mp4",
            "medium": "video",
            "isDefault": "false",
            "expression": "full",
            "bitrate": "456789",
            "framerate": "24/1",
            "duration": "60.0",
            "height": "360",
            "width": "640",
            "lang": "en",
            "fileSize": "12345",
        },
    ]
    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@ -233,7 +233,7 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
        return str(output_path)
    def fake_probe_media(file_path: str):
-        if file_path.endswith("vbr7.mp3"):
+        if file_path.endswith(".mp3-vbr7.mp3"):
            return {
                "format": {
                    "duration": "61.2",
@ -253,6 +253,26 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
                    }
                ],
            }
        if file_path.endswith(".mp3"):
            return {
                "format": {
                    "duration": "61.2",
                    "size": "5678",
                    "bit_rate": "128000",
                    "format_name": "mp3",
                    "format_long_name": "MP3",
                },
                "streams": [
                    {
                        "codec_type": "audio",
                        "codec_name": "mp3",
                        "bit_rate": "128000",
                        "duration_ts": "61200",
                        "sample_rate": "44100",
                        "channels": 2,
                    }
                ],
            }
        return {
            "format": {
                "duration": "61.2",
@ -333,9 +353,22 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
                "samplingrate": 48000,
                "channels": 2,
            },
            {
                "url": f"https://mirror.example/feeds/nasa/audio/{audio_base_path}",
                "path": audio_base_path,
                "type": "audio/mpeg",
                "medium": "audio",
                "isDefault": "false",
                "fileSize": "5678",
                "bitrate": 128000,
                "duration": "61.2",
                "samplingrate": 44100,
                "channels": 2,
            },
        ],
    }
    assert persisted == [
        (audio_base_path, "audio/mpeg"),
        (f"{audio_base_path}-vbr7.mp3", "audio/mp3"),
        (f"{audio_base_path}-vbr3.aac", "audio/aac"),
    ]
@ -383,8 +416,16 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
        lambda _: {
            "format": {
                "duration": "60.0",
-                "size": "9876",
+                "size": (
-                "bit_rate": "123456",
+                    "12345"
                    if _.endswith(".mp4") and not _.endswith("-720.mp4")
                    else "9876"
                ),
                "bit_rate": (
                    "456789"
                    if _.endswith(".mp4") and not _.endswith("-720.mp4")
                    else "123456"
                ),
                "format_name": "mp4",
                "format_long_name": "MP4",
            },
@ -392,11 +433,27 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
                {
                    "codec_type": "video",
                    "codec_name": "h264",
-                    "bit_rate": "123456",
+                    "bit_rate": (
                        "456789"
                        if _.endswith(".mp4") and not _.endswith("-720.mp4")
                        else "123456"
                    ),
                    "duration_ts": "60000",
-                    "width": 1280,
+                    "width": (
-                    "height": 720,
+                        640
-                    "avg_frame_rate": "30/1",
+                        if _.endswith(".mp4") and not _.endswith("-720.mp4")
                        else 1280
                    ),
                    "height": (
                        360
                        if _.endswith(".mp4") and not _.endswith("-720.mp4")
                        else 720
                    ),
                    "avg_frame_rate": (
                        "24/1"
                        if _.endswith(".mp4") and not _.endswith("-720.mp4")
                        else "30/1"
                    ),
                },
                {
                    "codec_type": "audio",
@ -451,10 +508,26 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
                "width": 1280,
                "height": 720,
                "framerate": "30/1",
-            }
+            },
            {
                "url": f"https://mirror.example/feeds/nasa/video/{video_base_path}",
                "path": video_base_path,
                "type": "video/mp4",
                "medium": "video",
                "isDefault": "false",
                "fileSize": "12345",
                "bitrate": 456789,
                "duration": "60.0",
                "width": 640,
                "height": 360,
                "framerate": "24/1",
            },
        ],
    }
-    assert persisted == [(f"{video_base_path}-720.mp4", "video/mp4")]
+    assert persisted == [
        (video_base_path, "video/mp4"),
        (f"{video_base_path}-720.mp4", "video/mp4"),
    ]
 def test_audio_pipeline_media_to_download_checks_canonical_path(
@ -465,8 +538,11 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    source_url = "https://example.com/podcast.mp3"
    audio_base_path = local_audio_path(source_url)
    original_path = store_dir(pipeline) / audio_base_path
    canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7.mp3"
    secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3.aac"
    original_path.parent.mkdir(parents=True, exist_ok=True)
    original_path.write_bytes(b"original")
    canonical_path.parent.mkdir(parents=True, exist_ok=True)
    canonical_path.write_bytes(b"default")
    secondary_path.write_bytes(b"alt")
@ -524,5 +600,10 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
    assert result is not None
    assert result["path"] == f"{audio_base_path}-vbr7.mp3"
    assert result["status"] == "uptodate"
    assert [variant.get("path") for variant in result["variants"]] == [
        f"{audio_base_path}-vbr7.mp3",
        f"{audio_base_path}-vbr3.aac",
        audio_base_path,
    ]
    assert f"{audio_base_path}.mp3" not in stat_paths
    assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3"