diff --git a/repub/pipelines.py b/repub/pipelines.py index 03d147d..c2b11e3 100644 --- a/repub/pipelines.py +++ b/repub/pipelines.py @@ -1,5 +1,6 @@ import hashlib import logging +import mimetypes import tempfile import time from io import BytesIO @@ -116,6 +117,26 @@ class TranscodePipeline(BaseFilesPipeline): for index, setting in enumerate(settings) ] + def original_path(self, source_url: str) -> str: + if self.media_type == repub.utils.FileType.AUDIO: + return repub.utils.local_audio_path(source_url) + if self.media_type == repub.utils.FileType.VIDEO: + return repub.utils.local_video_path(source_url) + raise ValueError(f"Unsupported media type: {self.media_type}") + + def original_mimetype(self, source_url: str, response=None) -> str: + if response is not None: + content_type = response.headers.get(b"Content-Type") + if content_type: + return content_type.decode("utf-8").split(";", 1)[0].strip() + mimetype = mimetypes.guess_type(source_url)[0] + if mimetype: + return mimetype + return { + repub.utils.FileType.AUDIO: "audio/mpeg", + repub.utils.FileType.VIDEO: "video/mp4", + }[self.media_type] + def published_url(self, path: str, item=None) -> str: relative_path = f"{self.media_dir()}/{path}" feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/") @@ -130,7 +151,7 @@ class TranscodePipeline(BaseFilesPipeline): self, *, path: str, - setting: media.MediaSettings, + mimetype: str, probe_result: dict[str, Any], is_default: bool, item=None, @@ -138,7 +159,7 @@ class TranscodePipeline(BaseFilesPipeline): variant: MediaVariant = { "url": self.published_url(path, item), "path": path, - "type": setting["mimetype"], + "type": mimetype, "medium": self.media_type.value, "isDefault": "true" if is_default else "false", } @@ -158,12 +179,24 @@ class TranscodePipeline(BaseFilesPipeline): variants.append( self.media_variant( path=path, - setting=setting, + mimetype=setting["mimetype"], probe_result=probe_result, is_default=is_default, item=item, ) ) + original_path = self.original_path(request.url) + original_file = self.local_store_path(original_path) + if original_file.exists(): + variants.append( + self.media_variant( + path=original_path, + mimetype=self.original_mimetype(request.url), + probe_result=media.probe_media(str(original_file)), + is_default=False, + item=item, + ) + ) return variants def make_file_result( @@ -201,6 +234,11 @@ class TranscodePipeline(BaseFilesPipeline): for _, _, path in self.variant_paths(request.url): if not cast(dict[str, Any] | None, self.store.stat_file(path, info)): return None + if not cast( + dict[str, Any] | None, + self.store.stat_file(self.original_path(request.url), info), + ): + return None self.inc_stats("uptodate") return self.make_file_result( request, @@ -218,6 +256,23 @@ class TranscodePipeline(BaseFilesPipeline): tmp_file = f"{tmp_dir}/original" with open(tmp_file, "wb") as f: f.write(response.body) + original_path = self.original_path(request.url) + if not cast( + dict[str, Any] | None, + self.store.stat_file(original_path, info), + ): + original_buf = read_asset(tmp_file) + self.store.persist_file( + original_path, + original_buf, + info, + meta=self.get_media_meta(media.probe_media(tmp_file)), + headers={ + "Content-Type": self.original_mimetype( + request.url, response=response + ) + }, + ) for _, setting, final_path in self.variant_paths(request.url): stat = cast( dict[str, Any] | None, diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py index ad3446a..8b1ede5 100644 --- a/tests/test_feed_validation.py +++ b/tests/test_feed_validation.py @@ -114,6 +114,21 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: "samplingrate": "48000", "channels": "2", }, + { + "url": _published_url( + "https://mirror.example", + f"audio/{audio_base_path}", + ), + "path": audio_base_path, + "type": "audio/mpeg", + "medium": "audio", + "isDefault": "false", + "fileSize": "5678", + "bitrate": "128000", + "duration": "61.2", + "samplingrate": "44100", + "channels": "2", + }, ], } ] @@ -143,7 +158,23 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: "width": "1280", "height": "720", "framerate": "30/1", - } + }, + { + "url": _published_url( + "https://mirror.example", + f"video/{video_base_path}", + ), + "path": video_base_path, + "type": "video/mp4", + "medium": "video", + "isDefault": "false", + "fileSize": "12345", + "bitrate": "456789", + "duration": "60.0", + "width": "640", + "height": "360", + "framerate": "24/1", + }, ], } ] @@ -271,6 +302,20 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: "duration": "61.2", "fileSize": "3456", }, + { + "url": ( + f"https://mirror.example/feeds/demo/audio/" + f"{local_audio_path(source_audio)}" + ), + "type": "audio/mpeg", + "medium": "audio", + "isDefault": "false", + "bitrate": "128000", + "samplingrate": "44100", + "channels": "2", + "duration": "61.2", + "fileSize": "5678", + }, ] video_variants = video_group.findall("media:content", namespaces=nsmap) @@ -291,7 +336,24 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: "width": "1280", "lang": "en", "fileSize": "9876", - } + }, + { + "url": ( + f"https://mirror.example/feeds/demo/video/" + f"{local_video_path(source_video)}" + ), + "type": "video/mp4", + "medium": "video", + "isDefault": "false", + "expression": "full", + "bitrate": "456789", + "framerate": "24/1", + "duration": "60.0", + "height": "360", + "width": "640", + "lang": "en", + "fileSize": "12345", + }, ] itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 15c6a80..9aa9a25 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -233,7 +233,7 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant return str(output_path) def fake_probe_media(file_path: str): - if file_path.endswith("vbr7.mp3"): + if file_path.endswith(".mp3-vbr7.mp3"): return { "format": { "duration": "61.2", @@ -253,6 +253,26 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant } ], } + if file_path.endswith(".mp3"): + return { + "format": { + "duration": "61.2", + "size": "5678", + "bit_rate": "128000", + "format_name": "mp3", + "format_long_name": "MP3", + }, + "streams": [ + { + "codec_type": "audio", + "codec_name": "mp3", + "bit_rate": "128000", + "duration_ts": "61200", + "sample_rate": "44100", + "channels": 2, + } + ], + } return { "format": { "duration": "61.2", @@ -333,9 +353,22 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant "samplingrate": 48000, "channels": 2, }, + { + "url": f"https://mirror.example/feeds/nasa/audio/{audio_base_path}", + "path": audio_base_path, + "type": "audio/mpeg", + "medium": "audio", + "isDefault": "false", + "fileSize": "5678", + "bitrate": 128000, + "duration": "61.2", + "samplingrate": 44100, + "channels": 2, + }, ], } assert persisted == [ + (audio_base_path, "audio/mpeg"), (f"{audio_base_path}-vbr7.mp3", "audio/mp3"), (f"{audio_base_path}-vbr3.aac", "audio/aac"), ] @@ -383,8 +416,16 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant lambda _: { "format": { "duration": "60.0", - "size": "9876", - "bit_rate": "123456", + "size": ( + "12345" + if _.endswith(".mp4") and not _.endswith("-720.mp4") + else "9876" + ), + "bit_rate": ( + "456789" + if _.endswith(".mp4") and not _.endswith("-720.mp4") + else "123456" + ), "format_name": "mp4", "format_long_name": "MP4", }, @@ -392,11 +433,27 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant { "codec_type": "video", "codec_name": "h264", - "bit_rate": "123456", + "bit_rate": ( + "456789" + if _.endswith(".mp4") and not _.endswith("-720.mp4") + else "123456" + ), "duration_ts": "60000", - "width": 1280, - "height": 720, - "avg_frame_rate": "30/1", + "width": ( + 640 + if _.endswith(".mp4") and not _.endswith("-720.mp4") + else 1280 + ), + "height": ( + 360 + if _.endswith(".mp4") and not _.endswith("-720.mp4") + else 720 + ), + "avg_frame_rate": ( + "24/1" + if _.endswith(".mp4") and not _.endswith("-720.mp4") + else "30/1" + ), }, { "codec_type": "audio", @@ -451,10 +508,26 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant "width": 1280, "height": 720, "framerate": "30/1", - } + }, + { + "url": f"https://mirror.example/feeds/nasa/video/{video_base_path}", + "path": video_base_path, + "type": "video/mp4", + "medium": "video", + "isDefault": "false", + "fileSize": "12345", + "bitrate": 456789, + "duration": "60.0", + "width": 640, + "height": 360, + "framerate": "24/1", + }, ], } - assert persisted == [(f"{video_base_path}-720.mp4", "video/mp4")] + assert persisted == [ + (video_base_path, "video/mp4"), + (f"{video_base_path}-720.mp4", "video/mp4"), + ] def test_audio_pipeline_media_to_download_checks_canonical_path( @@ -465,8 +538,11 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) source_url = "https://example.com/podcast.mp3" audio_base_path = local_audio_path(source_url) + original_path = store_dir(pipeline) / audio_base_path canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7.mp3" secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3.aac" + original_path.parent.mkdir(parents=True, exist_ok=True) + original_path.write_bytes(b"original") canonical_path.parent.mkdir(parents=True, exist_ok=True) canonical_path.write_bytes(b"default") secondary_path.write_bytes(b"alt") @@ -524,5 +600,10 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( assert result is not None assert result["path"] == f"{audio_base_path}-vbr7.mp3" assert result["status"] == "uptodate" + assert [variant.get("path") for variant in result["variants"]] == [ + f"{audio_base_path}-vbr7.mp3", + f"{audio_base_path}-vbr3.aac", + audio_base_path, + ] assert f"{audio_base_path}.mp3" not in stat_paths assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3"