diff --git a/repub/media.py b/repub/media.py index fe8074b..aec3a4d 100644 --- a/repub/media.py +++ b/repub/media.py @@ -77,7 +77,7 @@ def probe_media(file_path) -> Dict[str, Any]: def bitrate(info) -> float: try: return int(info["format"]["bit_rate"]) - except KeyError | ValueError: + except (KeyError, ValueError): logger.error("extracting bitrate from ffprobe failed") return math.inf @@ -85,16 +85,34 @@ def bitrate(info) -> float: def format_name(info) -> Optional[str]: try: return info["format"]["format_name"] - except KeyError | ValueError: + except (KeyError, ValueError): logger.error("extracting format from ffprobe failed") return None +def _stream_duration_sort_key(stream: Dict[str, Any]) -> tuple[int, float]: + duration_ts = _int_value(stream.get("duration_ts")) + if duration_ts is not None: + return 1, float(duration_ts) + try: + duration = float(str(stream.get("duration", ""))) + except (TypeError, ValueError): + duration = 0.0 + return 0, duration + + +def _matches_format(probe: Dict[str, Any], expected: str) -> bool: + current = format_name(probe) + if current is None: + return False + return expected in current.split(",") + + def primary_video_stream(probe): video_streams = [ stream for stream in probe["streams"] if stream["codec_type"] == "video" ] - video_streams = sorted(video_streams, key=lambda x: x["duration_ts"], reverse=True) + video_streams = sorted(video_streams, key=_stream_duration_sort_key, reverse=True) if not video_streams: return None if len(video_streams) > 1: @@ -108,7 +126,7 @@ def primary_audio_stream(probe): audio_streams = [ stream for stream in probe["streams"] if stream["codec_type"] == "audio" ] - audio_streams = sorted(audio_streams, key=lambda x: x["duration_ts"], reverse=True) + audio_streams = sorted(audio_streams, key=_stream_duration_sort_key, reverse=True) if not audio_streams: return None if len(audio_streams) > 1: @@ -126,7 +144,7 @@ def get_resolution(probe) -> Tuple[Optional[float], Optional[float]]: width = int(video_stream["width"]) height = int(video_stream["height"]) return width, height - except KeyError | ValueError: + except (KeyError, ValueError): logger.error("extracting resolution from ffprobe failed") return None, None @@ -137,7 +155,7 @@ def get_vcodec_name(probe) -> Optional[str]: if not video_stream: return None return video_stream["codec_name"] - except KeyError | ValueError: + except (KeyError, ValueError): logger.error("extracting video codec_name from ffprobe failed") return None @@ -147,8 +165,11 @@ def get_acodec_info(probe) -> Tuple[Optional[str], Optional[int]]: audio_stream = primary_audio_stream(probe) if not audio_stream: return None, None - return audio_stream["codec_name"], int(audio_stream["bit_rate"]) - except KeyError | ValueError: + audio_bitrate = _int_value( + audio_stream.get("bit_rate") or probe["format"].get("bit_rate") + ) + return audio_stream["codec_name"], audio_bitrate + except (KeyError, ValueError): logger.error("extracting audio codec_name from ffprobe failed") return None, None @@ -218,7 +239,7 @@ def audio_transcode_params( is_br = True else: is_br = False - if format_name(probe_result) == fmt: + if _matches_format(probe_result, fmt): is_fmt = True else: is_fmt = False @@ -289,11 +310,7 @@ def video_transcode_params( # TODO: turn this into an exception and catch it for reporting return None - current_container_many = format_name(probe_result) - is_container = False - if current_container_many is not None: - if target_container in current_container_many.split(","): - is_container = True + is_container = _matches_format(probe_result, target_container) is_vcodec = vcodec == target_vcodec is_acodec = acodec == target_acodec diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py index 22589b4..290a90a 100644 --- a/tests/test_feed_validation.py +++ b/tests/test_feed_validation.py @@ -14,7 +14,13 @@ from repub.exporters import RssExporter from repub.items import ElementItem from repub.rss import nsmap from repub.spiders.rss_spider import RssFeedSpider -from repub.utils import local_audio_path, local_image_path, local_video_path +from repub.utils import ( + FileType, + local_audio_path, + local_image_path, + local_video_path, + published_media_path, +) RSS_DATE_PATTERN = re.compile( r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$" @@ -69,17 +75,32 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: source_video = "https://source.example/media/video.mp4" channel_image = "https://source.example/media/channel.png" item_image = "https://source.example/media/cover.jpg" + audio_base_path = local_audio_path(source_audio) + audio_default_path = published_media_path( + FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0] + ) + audio_m4a_path = published_media_path( + FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[1] + ) + audio_webm_path = published_media_path( + FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[2] + ) + video_base_path = local_video_path(source_video) + video_main_path = published_media_path( + FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[0] + ) + video_fallback_path = published_media_path( + FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[1] + ) def prepare_item(item: ElementItem) -> None: - audio_base_path = local_audio_path(source_audio) - video_base_path = local_video_path(source_video) item.audios = [ { "url": source_audio, - "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", + "path": audio_default_path, "published_url": _published_url( "https://mirror.example", - f"audio/{audio_base_path}-vbr7-3b2b0f13.mp3", + f"audio/{audio_default_path}", ), "checksum": "audio-default", "status": "downloaded", @@ -87,32 +108,47 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: { "url": _published_url( "https://mirror.example", - f"audio/{audio_base_path}-vbr7-3b2b0f13.mp3", + f"audio/{audio_default_path}", ), - "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", - "type": "audio/mp3", + "path": audio_default_path, + "type": "audio/mpeg", "medium": "audio", "isDefault": "true", "fileSize": "4567", - "bitrate": "96000", + "bitrate": "37209", "duration": "61.2", - "samplingrate": "44100", - "channels": "2", + "samplingrate": "48000", + "channels": "1", }, { "url": _published_url( "https://mirror.example", - f"audio/{audio_base_path}-vbr3-4a2a58d5.aac", + f"audio/{audio_m4a_path}", ), - "path": f"{audio_base_path}-vbr3-4a2a58d5.aac", - "type": "audio/aac", + "path": audio_m4a_path, + "type": "audio/mp4", "medium": "audio", "isDefault": "false", "fileSize": "3456", - "bitrate": "88000", + "bitrate": "20746", "duration": "61.2", "samplingrate": "48000", - "channels": "2", + "channels": "1", + }, + { + "url": _published_url( + "https://mirror.example", + f"audio/{audio_webm_path}", + ), + "path": audio_webm_path, + "type": "audio/webm", + "medium": "audio", + "isDefault": "false", + "fileSize": "2345", + "bitrate": "48000", + "duration": "61.2", + "samplingrate": "48000", + "channels": "1", }, { "url": _published_url( @@ -135,10 +171,10 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: item.videos = [ { "url": source_video, - "path": f"{video_base_path}-720-457f0928.mp4", + "path": video_main_path, "published_url": _published_url( "https://mirror.example", - f"video/{video_base_path}-720-457f0928.mp4", + f"video/{video_main_path}", ), "checksum": "video-default", "status": "downloaded", @@ -146,9 +182,9 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: { "url": _published_url( "https://mirror.example", - f"video/{video_base_path}-720-457f0928.mp4", + f"video/{video_main_path}", ), - "path": f"{video_base_path}-720-457f0928.mp4", + "path": video_main_path, "type": "video/mp4", "medium": "video", "isDefault": "true", @@ -159,6 +195,22 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: "height": "720", "framerate": "30/1", }, + { + "url": _published_url( + "https://mirror.example", + f"video/{video_fallback_path}", + ), + "path": video_fallback_path, + "type": "video/webm", + "medium": "video", + "isDefault": "false", + "fileSize": "6789", + "bitrate": "64000", + "duration": "60.0", + "width": "1280", + "height": "720", + "framerate": "25/1", + }, { "url": _published_url( "https://mirror.example", @@ -257,12 +309,9 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: enclosure = root.find("./channel/item/enclosure") assert enclosure is not None assert enclosure.attrib == { - "url": ( - f"https://mirror.example/feeds/demo/audio/" - f"{local_audio_path(source_audio)}-vbr7-3b2b0f13.mp3" - ), + "url": (f"https://mirror.example/feeds/demo/audio/" f"{audio_default_path}"), "length": "4567", - "type": "audio/mp3", + "type": "audio/mpeg", } assert len(enclosure) == 0 @@ -276,32 +325,39 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: assert [variant.attrib for variant in audio_variants] == [ { "url": ( - f"https://mirror.example/feeds/demo/audio/" - f"{local_audio_path(source_audio)}-vbr7-3b2b0f13.mp3" + f"https://mirror.example/feeds/demo/audio/" f"{audio_default_path}" ), - "type": "audio/mp3", + "type": "audio/mpeg", "medium": "audio", "isDefault": "true", - "bitrate": "96000", - "samplingrate": "44100", - "channels": "2", + "bitrate": "37209", + "samplingrate": "48000", + "channels": "1", "duration": "61.2", "fileSize": "4567", }, { - "url": ( - f"https://mirror.example/feeds/demo/audio/" - f"{local_audio_path(source_audio)}-vbr3-4a2a58d5.aac" - ), - "type": "audio/aac", + "url": (f"https://mirror.example/feeds/demo/audio/" f"{audio_m4a_path}"), + "type": "audio/mp4", "medium": "audio", "isDefault": "false", - "bitrate": "88000", + "bitrate": "20746", "samplingrate": "48000", - "channels": "2", + "channels": "1", "duration": "61.2", "fileSize": "3456", }, + { + "url": (f"https://mirror.example/feeds/demo/audio/" f"{audio_webm_path}"), + "type": "audio/webm", + "medium": "audio", + "isDefault": "false", + "bitrate": "48000", + "samplingrate": "48000", + "channels": "1", + "duration": "61.2", + "fileSize": "2345", + }, { "url": ( f"https://mirror.example/feeds/demo/audio/" @@ -321,10 +377,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: video_variants = video_group.findall("media:content", namespaces=nsmap) assert [variant.attrib for variant in video_variants] == [ { - "url": ( - f"https://mirror.example/feeds/demo/video/" - f"{local_video_path(source_video)}-720-457f0928.mp4" - ), + "url": (f"https://mirror.example/feeds/demo/video/" f"{video_main_path}"), "type": "video/mp4", "medium": "video", "isDefault": "true", @@ -337,6 +390,22 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: "lang": "en", "fileSize": "9876", }, + { + "url": ( + f"https://mirror.example/feeds/demo/video/" f"{video_fallback_path}" + ), + "type": "video/webm", + "medium": "video", + "isDefault": "false", + "expression": "full", + "bitrate": "64000", + "framerate": "25/1", + "duration": "60.0", + "height": "720", + "width": "1280", + "lang": "en", + "fileSize": "6789", + }, { "url": ( f"https://mirror.example/feeds/demo/video/" diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py index 66246f3..ff43b6a 100644 --- a/tests/test_file_feeds.py +++ b/tests/test_file_feeds.py @@ -71,16 +71,22 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: "https://example.com/media/podcast.mp3", ) == ( "audio/" - f"{local_audio_path('https://example.com/media/podcast.mp3')}" - "-vbr7-3b2b0f13.mp3" + + published_media_path( + FileType.AUDIO, + "https://example.com/media/podcast.mp3", + repub_settings.REPUBLISHER_AUDIO[0], + ) ) assert spider.rewrite_file_url( FileType.VIDEO, "https://example.com/media/clip.mp4", ) == ( "video/" - f"{local_video_path('https://example.com/media/clip.mp4')}" - "-720-457f0928.mp4" + + published_media_path( + FileType.VIDEO, + "https://example.com/media/clip.mp4", + repub_settings.REPUBLISHER_VIDEO[0], + ) ) @@ -90,10 +96,10 @@ def test_published_media_path_changes_when_profile_args_change() -> None: base_profile = repub_settings.REPUBLISHER_VIDEO[0] assert published_media_path(FileType.AUDIO, source_url, audio_profile) == ( - f"{local_audio_path(source_url)}-vbr7-3b2b0f13.mp3" + f"{local_audio_path(source_url)}-mp3_vbr7_voice-1cc131cf.mp3" ) assert published_media_path(FileType.VIDEO, source_url, base_profile) == ( - f"{local_video_path(source_url)}-720-457f0928.mp4" + f"{local_video_path(source_url)}-main-4fb03ba0.mp4" ) changed_audio_profile = {**audio_profile, "max_bitrate": 128000} diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index e82672b..523f9bd 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -8,6 +8,7 @@ from scrapy.crawler import Crawler from scrapy.http import Request, Response from repub import media +from repub import settings as repub_settings from repub.config import ( FeedConfig, RepublisherConfig, @@ -16,7 +17,12 @@ from repub.config import ( ) from repub.items import ElementItem from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline -from repub.utils import local_audio_path, local_video_path +from repub.utils import ( + FileType, + local_audio_path, + local_video_path, + published_media_path, +) def build_test_crawler(tmp_path: Path) -> SimpleNamespace: @@ -309,6 +315,103 @@ def test_video_transcode_params_scales_to_max_height_for_multipass() -> None: } +def test_audio_transcode_params_accepts_m4a_format_family() -> None: + params = media.audio_transcode_params( + { + "format": { + "bit_rate": "20000", + "format_name": "mov,mp4,m4a,3gp,3g2,mj2", + }, + "streams": [ + { + "codec_type": "audio", + "codec_name": "aac", + "bit_rate": "20000", + "duration_ts": "1", + } + ], + }, + cast( + media.AudioSettings, + { + "name": "m4a", + "format": "m4a", + "max_bitrate": 64000, + "mimetype": "audio/mp4", + "extension": "m4a", + "ffmpeg_audio_params": { + "acodec": "libfdk_aac", + "vbr": "2", + }, + }, + ), + ) + + assert params is None + + +def test_audio_meta_handles_webm_without_duration_ts() -> None: + assert media.audio_meta( + { + "format": { + "duration": "1.0", + "size": "100", + "bit_rate": "48000", + "format_name": "matroska,webm", + }, + "streams": [ + { + "codec_type": "audio", + "codec_name": "opus", + "sample_rate": "48000", + "channels": 1, + } + ], + } + ) == { + "duration": "1.0", + "fileSize": "100", + "bitrate": 48000, + "samplingrate": 48000, + "channels": 1, + } + + +def test_video_meta_handles_webm_without_duration_ts() -> None: + assert media.video_meta( + { + "format": { + "duration": "1.0", + "size": "200", + "bit_rate": "64000", + "format_name": "matroska,webm", + }, + "streams": [ + { + "codec_type": "video", + "codec_name": "vp9", + "width": 640, + "height": 360, + "avg_frame_rate": "25/1", + }, + { + "codec_type": "audio", + "codec_name": "opus", + "sample_rate": "48000", + "channels": 1, + }, + ], + } + ) == { + "duration": "1.0", + "fileSize": "200", + "width": 640, + "height": 360, + "bitrate": 64000, + "framerate": "25/1", + } + + def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants( monkeypatch, tmp_path: Path ) -> None: @@ -337,13 +440,24 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant output_path.write_bytes(settings["name"].encode("utf-8")) return str(output_path) + audio_default_path = published_media_path( + FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0] + ) + audio_m4a_path = published_media_path( + FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1] + ) + audio_webm_path = published_media_path( + FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2] + ) + def fake_probe_media(file_path: str): - if file_path.endswith(".mp3-vbr7-3b2b0f13.mp3"): + file_name = Path(file_path).name + if file_path.endswith(audio_default_path) or file_name == "mp3_vbr7_voice.mp3": return { "format": { "duration": "61.2", "size": "4567", - "bit_rate": "96000", + "bit_rate": "37209", "format_name": "mp3", "format_long_name": "MP3", }, @@ -351,48 +465,69 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant { "codec_type": "audio", "codec_name": "mp3", - "bit_rate": "96000", + "bit_rate": "37209", "duration_ts": "61200", - "sample_rate": "44100", - "channels": 2, + "sample_rate": "48000", + "channels": 1, } ], } - if file_path.endswith(".mp3"): + if file_path.endswith(audio_m4a_path) or file_name == "m4a_aac_vbr2_voice.m4a": return { "format": { "duration": "61.2", - "size": "5678", - "bit_rate": "128000", - "format_name": "mp3", - "format_long_name": "MP3", + "size": "3456", + "bit_rate": "20746", + "format_name": "mov,mp4,m4a,3gp,3g2,mj2", + "format_long_name": "AAC", }, "streams": [ { "codec_type": "audio", - "codec_name": "mp3", - "bit_rate": "128000", + "codec_name": "aac", + "bit_rate": "20746", "duration_ts": "61200", - "sample_rate": "44100", - "channels": 2, + "sample_rate": "48000", + "channels": 1, + } + ], + } + if ( + file_path.endswith(audio_webm_path) + or file_name == "webm_opus_voice_48k.webm" + ): + return { + "format": { + "duration": "61.2", + "size": "2345", + "bit_rate": "48000", + "format_name": "matroska,webm", + "format_long_name": "WebM", + }, + "streams": [ + { + "codec_type": "audio", + "codec_name": "opus", + "sample_rate": "48000", + "channels": 1, } ], } return { "format": { "duration": "61.2", - "size": "3456", - "bit_rate": "88000", - "format_name": "aac", - "format_long_name": "AAC", + "size": "5678", + "bit_rate": "128000", + "format_name": "mp3", + "format_long_name": "MP3", }, "streams": [ { "codec_type": "audio", - "codec_name": "aac", - "bit_rate": "88000", + "codec_name": "mp3", + "bit_rate": "128000", "duration_ts": "61200", - "sample_rate": "48000", + "sample_rate": "44100", "channels": 2, } ], @@ -423,43 +558,48 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant assert isinstance(result["checksum"], str) assert result == { "url": source_url, - "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", + "path": audio_default_path, "published_url": ( - "https://mirror.example/feeds/nasa/audio/" - f"{audio_base_path}-vbr7-3b2b0f13.mp3" + f"https://mirror.example/feeds/nasa/audio/{audio_default_path}" ), "checksum": result["checksum"], "status": "downloaded", "variants": [ { - "url": ( - "https://mirror.example/feeds/nasa/audio/" - f"{audio_base_path}-vbr7-3b2b0f13.mp3" - ), - "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", - "type": "audio/mp3", + "url": f"https://mirror.example/feeds/nasa/audio/{audio_default_path}", + "path": audio_default_path, + "type": "audio/mpeg", "medium": "audio", "isDefault": "true", "fileSize": "4567", - "bitrate": 96000, + "bitrate": 37209, "duration": "61.2", - "samplingrate": 44100, - "channels": 2, + "samplingrate": 48000, + "channels": 1, }, { - "url": ( - "https://mirror.example/feeds/nasa/audio/" - f"{audio_base_path}-vbr3-4a2a58d5.aac" - ), - "path": f"{audio_base_path}-vbr3-4a2a58d5.aac", - "type": "audio/aac", + "url": f"https://mirror.example/feeds/nasa/audio/{audio_m4a_path}", + "path": audio_m4a_path, + "type": "audio/mp4", "medium": "audio", "isDefault": "false", "fileSize": "3456", - "bitrate": 88000, + "bitrate": 20746, "duration": "61.2", "samplingrate": 48000, - "channels": 2, + "channels": 1, + }, + { + "url": f"https://mirror.example/feeds/nasa/audio/{audio_webm_path}", + "path": audio_webm_path, + "type": "audio/webm", + "medium": "audio", + "isDefault": "false", + "fileSize": "2345", + "bitrate": 48000, + "duration": "61.2", + "samplingrate": 48000, + "channels": 1, }, { "url": f"https://mirror.example/feeds/nasa/audio/{audio_base_path}", @@ -477,8 +617,9 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant } assert persisted == [ (audio_base_path, "audio/mpeg"), - (f"{audio_base_path}-vbr7-3b2b0f13.mp3", "audio/mp3"), - (f"{audio_base_path}-vbr3-4a2a58d5.aac", "audio/aac"), + (audio_default_path, "audio/mpeg"), + (audio_m4a_path, "audio/mp4"), + (audio_webm_path, "audio/webm"), ] completed_item = pipeline.item_completed( @@ -518,23 +659,70 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant return str(output_path) monkeypatch.setattr(pipeline, "transcode", fake_transcode) - transcoded_suffix = "-720-457f0928.mp4" - monkeypatch.setattr( - media, - "probe_media", - lambda _: { + video_main_path = published_media_path( + FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[0] + ) + video_fallback_path = published_media_path( + FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[1] + ) + + def fake_probe_media(file_path: str): + file_name = Path(file_path).name + if file_path.endswith(video_main_path) or file_name == "main.mp4": + return { + "format": { + "duration": "60.0", + "size": "9876", + "bit_rate": "123456", + "format_name": "mp4", + "format_long_name": "MP4", + }, + "streams": [ + { + "codec_type": "video", + "codec_name": "h264", + "bit_rate": "123456", + "duration_ts": "60000", + "width": 1280, + "height": 720, + "avg_frame_rate": "30/1", + }, + { + "codec_type": "audio", + "codec_name": "aac", + "bit_rate": "96000", + "duration_ts": "60000", + }, + ], + } + if file_path.endswith(video_fallback_path) or file_name == "fallback.webm": + return { + "format": { + "duration": "60.0", + "size": "6789", + "bit_rate": "64000", + "format_name": "matroska,webm", + "format_long_name": "WebM", + }, + "streams": [ + { + "codec_type": "video", + "codec_name": "vp9", + "width": 1280, + "height": 720, + "avg_frame_rate": "25/1", + }, + { + "codec_type": "audio", + "codec_name": "opus", + }, + ], + } + return { "format": { "duration": "60.0", - "size": ( - "12345" - if _.endswith(".mp4") and not _.endswith(transcoded_suffix) - else "9876" - ), - "bit_rate": ( - "456789" - if _.endswith(".mp4") and not _.endswith(transcoded_suffix) - else "123456" - ), + "size": "12345", + "bit_rate": "456789", "format_name": "mp4", "format_long_name": "MP4", }, @@ -542,27 +730,11 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant { "codec_type": "video", "codec_name": "h264", - "bit_rate": ( - "456789" - if _.endswith(".mp4") and not _.endswith(transcoded_suffix) - else "123456" - ), + "bit_rate": "456789", "duration_ts": "60000", - "width": ( - 640 - if _.endswith(".mp4") and not _.endswith(transcoded_suffix) - else 1280 - ), - "height": ( - 360 - if _.endswith(".mp4") and not _.endswith(transcoded_suffix) - else 720 - ), - "avg_frame_rate": ( - "24/1" - if _.endswith(".mp4") and not _.endswith(transcoded_suffix) - else "30/1" - ), + "width": 640, + "height": 360, + "avg_frame_rate": "24/1", }, { "codec_type": "audio", @@ -571,8 +743,9 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant "duration_ts": "60000", }, ], - }, - ) + } + + monkeypatch.setattr(media, "probe_media", fake_probe_media) def fake_persist_file(path, buf, info, meta=None, headers=None): del info, meta @@ -596,20 +769,14 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant assert isinstance(result["checksum"], str) assert result == { "url": source_url, - "path": f"{video_base_path}-720-457f0928.mp4", - "published_url": ( - "https://mirror.example/feeds/nasa/video/" - f"{video_base_path}-720-457f0928.mp4" - ), + "path": video_main_path, + "published_url": (f"https://mirror.example/feeds/nasa/video/{video_main_path}"), "checksum": result["checksum"], "status": "downloaded", "variants": [ { - "url": ( - "https://mirror.example/feeds/nasa/video/" - f"{video_base_path}-720-457f0928.mp4" - ), - "path": f"{video_base_path}-720-457f0928.mp4", + "url": f"https://mirror.example/feeds/nasa/video/{video_main_path}", + "path": video_main_path, "type": "video/mp4", "medium": "video", "isDefault": "true", @@ -620,6 +787,19 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant "height": 720, "framerate": "30/1", }, + { + "url": f"https://mirror.example/feeds/nasa/video/{video_fallback_path}", + "path": video_fallback_path, + "type": "video/webm", + "medium": "video", + "isDefault": "false", + "fileSize": "6789", + "bitrate": 64000, + "duration": "60.0", + "width": 1280, + "height": 720, + "framerate": "25/1", + }, { "url": f"https://mirror.example/feeds/nasa/video/{video_base_path}", "path": video_base_path, @@ -637,7 +817,8 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant } assert persisted == [ (video_base_path, "video/mp4"), - (f"{video_base_path}-720-457f0928.mp4", "video/mp4"), + (video_main_path, "video/mp4"), + (video_fallback_path, "video/webm"), ] @@ -650,13 +831,24 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( source_url = "https://example.com/podcast.mp3" audio_base_path = local_audio_path(source_url) original_path = store_dir(pipeline) / audio_base_path - canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7-3b2b0f13.mp3" - secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3-4a2a58d5.aac" + audio_default_path = published_media_path( + FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0] + ) + audio_m4a_path = published_media_path( + FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1] + ) + audio_webm_path = published_media_path( + FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2] + ) + canonical_path = store_dir(pipeline) / audio_default_path + m4a_path = store_dir(pipeline) / audio_m4a_path + webm_path = store_dir(pipeline) / audio_webm_path original_path.parent.mkdir(parents=True, exist_ok=True) original_path.write_bytes(b"original") canonical_path.parent.mkdir(parents=True, exist_ok=True) canonical_path.write_bytes(b"default") - secondary_path.write_bytes(b"alt") + m4a_path.write_bytes(b"alt-aac") + webm_path.write_bytes(b"alt-webm") stat_paths: list[str] = [] original_stat_file = pipeline.store.stat_file item = ElementItem( @@ -683,12 +875,38 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( lambda file_path: { "format": { "duration": "61.2", - "size": ("4567" if file_path.endswith("vbr7-3b2b0f13.mp3") else "3456"), + "size": ( + "4567" + if file_path.endswith(audio_default_path) + else ( + "3456" + if file_path.endswith(audio_m4a_path) + else "2345" if file_path.endswith(audio_webm_path) else "5678" + ) + ), "bit_rate": ( - "96000" if file_path.endswith("vbr7-3b2b0f13.mp3") else "88000" + "37209" + if file_path.endswith(audio_default_path) + else ( + "20746" + if file_path.endswith(audio_m4a_path) + else ( + "48000" if file_path.endswith(audio_webm_path) else "128000" + ) + ) ), "format_name": ( - "mp3" if file_path.endswith("vbr7-3b2b0f13.mp3") else "aac" + "mp3" + if file_path.endswith(audio_default_path) + else ( + "mov,mp4,m4a,3gp,3g2,mj2" + if file_path.endswith(audio_m4a_path) + else ( + "matroska,webm" + if file_path.endswith(audio_webm_path) + else "mp3" + ) + ) ), "format_long_name": "Audio", }, @@ -696,16 +914,36 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( { "codec_type": "audio", "codec_name": ( - "mp3" if file_path.endswith("vbr7-3b2b0f13.mp3") else "aac" + "mp3" + if file_path.endswith(audio_default_path) + else ( + "aac" + if file_path.endswith(audio_m4a_path) + else ( + "opus" if file_path.endswith(audio_webm_path) else "mp3" + ) + ) ), "bit_rate": ( - "96000" if file_path.endswith("vbr7-3b2b0f13.mp3") else "88000" + "37209" + if file_path.endswith(audio_default_path) + else ( + "20746" + if file_path.endswith(audio_m4a_path) + else ( + None + if file_path.endswith(audio_webm_path) + else "128000" + ) + ) + ), + "duration_ts": ( + None if file_path.endswith(audio_webm_path) else "61200" ), - "duration_ts": "61200", "sample_rate": ( - "44100" if file_path.endswith("vbr7-3b2b0f13.mp3") else "48000" + "44100" if file_path == str(original_path) else "48000" ), - "channels": 2, + "channels": 1 if file_path != str(original_path) else 2, } ], }, @@ -717,12 +955,13 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( item=item, ) assert result is not None - assert result["path"] == f"{audio_base_path}-vbr7-3b2b0f13.mp3" + assert result["path"] == audio_default_path assert result["status"] == "uptodate" assert [variant.get("path") for variant in result["variants"]] == [ - f"{audio_base_path}-vbr7-3b2b0f13.mp3", - f"{audio_base_path}-vbr3-4a2a58d5.aac", + audio_default_path, + audio_m4a_path, + audio_webm_path, audio_base_path, ] assert f"{audio_base_path}.mp3" not in stat_paths - assert stat_paths[0] == f"{audio_base_path}-vbr7-3b2b0f13.mp3" + assert stat_paths[0] == audio_default_path