diff --git a/repub/media.py b/repub/media.py index aec3a4d..fe8074b 100644 --- a/repub/media.py +++ b/repub/media.py @@ -77,7 +77,7 @@ def probe_media(file_path) -> Dict[str, Any]: def bitrate(info) -> float: try: return int(info["format"]["bit_rate"]) - except (KeyError, ValueError): + except KeyError | ValueError: logger.error("extracting bitrate from ffprobe failed") return math.inf @@ -85,34 +85,16 @@ def bitrate(info) -> float: def format_name(info) -> Optional[str]: try: return info["format"]["format_name"] - except (KeyError, ValueError): + except KeyError | ValueError: logger.error("extracting format from ffprobe failed") return None -def _stream_duration_sort_key(stream: Dict[str, Any]) -> tuple[int, float]: - duration_ts = _int_value(stream.get("duration_ts")) - if duration_ts is not None: - return 1, float(duration_ts) - try: - duration = float(str(stream.get("duration", ""))) - except (TypeError, ValueError): - duration = 0.0 - return 0, duration - - -def _matches_format(probe: Dict[str, Any], expected: str) -> bool: - current = format_name(probe) - if current is None: - return False - return expected in current.split(",") - - def primary_video_stream(probe): video_streams = [ stream for stream in probe["streams"] if stream["codec_type"] == "video" ] - video_streams = sorted(video_streams, key=_stream_duration_sort_key, reverse=True) + video_streams = sorted(video_streams, key=lambda x: x["duration_ts"], reverse=True) if not video_streams: return None if len(video_streams) > 1: @@ -126,7 +108,7 @@ def primary_audio_stream(probe): audio_streams = [ stream for stream in probe["streams"] if stream["codec_type"] == "audio" ] - audio_streams = sorted(audio_streams, key=_stream_duration_sort_key, reverse=True) + audio_streams = sorted(audio_streams, key=lambda x: x["duration_ts"], reverse=True) if not audio_streams: return None if len(audio_streams) > 1: @@ -144,7 +126,7 @@ def get_resolution(probe) -> Tuple[Optional[float], Optional[float]]: width = int(video_stream["width"]) height = int(video_stream["height"]) return width, height - except (KeyError, ValueError): + except KeyError | ValueError: logger.error("extracting resolution from ffprobe failed") return None, None @@ -155,7 +137,7 @@ def get_vcodec_name(probe) -> Optional[str]: if not video_stream: return None return video_stream["codec_name"] - except (KeyError, ValueError): + except KeyError | ValueError: logger.error("extracting video codec_name from ffprobe failed") return None @@ -165,11 +147,8 @@ def get_acodec_info(probe) -> Tuple[Optional[str], Optional[int]]: audio_stream = primary_audio_stream(probe) if not audio_stream: return None, None - audio_bitrate = _int_value( - audio_stream.get("bit_rate") or probe["format"].get("bit_rate") - ) - return audio_stream["codec_name"], audio_bitrate - except (KeyError, ValueError): + return audio_stream["codec_name"], int(audio_stream["bit_rate"]) + except KeyError | ValueError: logger.error("extracting audio codec_name from ffprobe failed") return None, None @@ -239,7 +218,7 @@ def audio_transcode_params( is_br = True else: is_br = False - if _matches_format(probe_result, fmt): + if format_name(probe_result) == fmt: is_fmt = True else: is_fmt = False @@ -310,7 +289,11 @@ def video_transcode_params( # TODO: turn this into an exception and catch it for reporting return None - is_container = _matches_format(probe_result, target_container) + current_container_many = format_name(probe_result) + is_container = False + if current_container_many is not None: + if target_container in current_container_many.split(","): + is_container = True is_vcodec = vcodec == target_vcodec is_acodec = acodec == target_acodec diff --git a/repub/settings.py b/repub/settings.py index 252c974..d39b635 100644 --- a/repub/settings.py +++ b/repub/settings.py @@ -102,95 +102,79 @@ MEDIA_ALLOW_REDIRECTS = True REPUBLISHER_AUDIO = [ { - "name": "mp3_vbr7_voice", + "name": "vbr7", "format": "mp3", - "max_bitrate": 64000, - "mimetype": "audio/mpeg", + "max_bitrate": 96000, + "mimetype": "audio/mp3", "extension": "mp3", "ffmpeg_audio_params": { "acodec": "libmp3lame", + # https://trac.ffmpeg.org/wiki/Encode/MP3#VBREncoding "qscale:a": "7", - "ac": "1", - "ar": "48000", }, }, { - "name": "m4a_aac_vbr2_voice", - "format": "m4a", - "max_bitrate": 64000, - "mimetype": "audio/mp4", - "extension": "m4a", + "name": "vbr3", + "format": "aac", + "max_bitrate": 96000, + "mimetype": "audio/aac", + "extension": "aac", "ffmpeg_audio_params": { "acodec": "libfdk_aac", - "vbr": "2", - "ac": "1", - "ar": "48000", - }, - }, - { - "name": "webm_opus_voice_48k", - "format": "webm", - "max_bitrate": 48000, - "mimetype": "audio/webm", - "extension": "webm", - "ffmpeg_audio_params": { - "acodec": "libopus", - "b:a": "48k", - "ac": "1", - "ar": "48000", - "application": "voip", + # https://trac.ffmpeg.org/wiki/Encode/MP3#VBREncoding + "vbr": "3", }, }, ] REPUBLISHER_VIDEO = [ - # broadly compatible { - "name": "main", + "name": "720", "container": "mp4", "vcodec": "h264", - "acodec": "aac", + "acodec": "mp3", "audio_max_bitrate": 96000, "ffmpeg_audio_params": { - "acodec": "aac", - "b:a": "96k", - "ac": "2", - "ar": "48000", - }, - "ffmpeg_video_params": { - "vcodec": "libx264", - "pix_fmt": "yuv420p", - "profile:v": "main", - "level": "4.0", - "preset": "medium", - "crf": "22", - "movflags": "+faststart", + "acodec": "libmp3lame", + # https://trac.ffmpeg.org/wiki/Encode/MP3#VBREncoding + "qscale:a": "7", }, + "ffmpeg_video_params": {"vcodec": "h264", "strict": "-2"}, "max_height": 720, "mimetype": "video/mp4", "extension": "mp4", }, - # linux fallback without patent encumberance - { - "name": "fallback", - "container": "webm", - "vcodec": "vp9", - "acodec": "opus", - "audio_max_bitrate": 96000, - "ffmpeg_audio_params": { - "acodec": "libopus", - "b:a": "96k", - "ac": "2", - "ar": "48000", - }, - "ffmpeg_video_params": { - "vcodec": "libvpx-vp9", - "crf": "33", - "b:v": "0", - }, - "max_height": 720, - "mimetype": "video/webm", - "extension": "webm", - }, + # { + # "passes": [ + # { + # "c:v": "libvpx-vp9", + # "b:v": "0", + # "crf": "30", + # "pass": "1", + # "deadline": "good", + # "row-mt": "1", + # "f": "null", + # }, + # { + # "c:v": "libvpx-vp9", + # "b:v": "0", + # "crf": "30", + # "pass": "2", + # "deadline": "good", + # "row-mt": "1", + # "c:a": "libopus", + # "b:a": "96k", + # "ac": "2", + # }, + # ], + # "name": "720", + # "container": "webm", + # "vcodec": "libvpx-vp9", + # "acodec": "opus", + # "audio_max_bitrate": 96000, + # "max_height": 720, + # "mimetype": "video/webm", + # "extension": "webm", + # }, ] REPUBLISHER_FFMPEG_ENCODERS = ["libmp3lame", "libfdk_aac", "libvpx-vp9", "libopus"] diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index fa27317..80be20e 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -281,14 +281,6 @@ class RssFeedSpider(BaseRssFeedSpider): file_urls = [] audio_urls = [] video_urls = [] - source_description_html = ( - sanitize_html(entry.get("summary", "")) if "summary_detail" in entry else "" - ) - has_content_html = any( - c.type == "text/html" and ((getattr(c, "value", "") or "").strip() != "") - for c in entry.get("content", []) - ) - description_html = source_description_html if has_content_html else "" def add_url(file_type, url): if file_type == FileType.IMAGE: @@ -303,7 +295,7 @@ class RssFeedSpider(BaseRssFeedSpider): item = E.item( E.title(entry.get("title")), E.link(entry.get("link")), - E.description(description_html), + E.description(sanitize_html(entry.get("description", ""))), E.guid( entry.get("id"), {"isPermaLink": "true" if entry.guidislink else "false"}, @@ -349,8 +341,6 @@ class RssFeedSpider(BaseRssFeedSpider): image_urls.extend(urls[FileType.IMAGE]) video_urls.extend(urls[FileType.VIDEO]) audio_urls.extend(urls[FileType.AUDIO]) - if not has_content_html and source_description_html.strip() != "": - item.append(CONTENT.encoded(CDATA(source_description_html))) if isinstance(entry.get("media_content"), list): for media in ( diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py index 9e1f80b..22589b4 100644 --- a/tests/test_feed_validation.py +++ b/tests/test_feed_validation.py @@ -14,13 +14,7 @@ from repub.exporters import RssExporter from repub.items import ElementItem from repub.rss import nsmap from repub.spiders.rss_spider import RssFeedSpider -from repub.utils import ( - FileType, - local_audio_path, - local_image_path, - local_video_path, - published_media_path, -) +from repub.utils import local_audio_path, local_image_path, local_video_path RSS_DATE_PATTERN = re.compile( r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$" @@ -75,32 +69,17 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: source_video = "https://source.example/media/video.mp4" channel_image = "https://source.example/media/channel.png" item_image = "https://source.example/media/cover.jpg" - audio_base_path = local_audio_path(source_audio) - audio_default_path = published_media_path( - FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0] - ) - audio_m4a_path = published_media_path( - FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[1] - ) - audio_webm_path = published_media_path( - FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[2] - ) - video_base_path = local_video_path(source_video) - video_main_path = published_media_path( - FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[0] - ) - video_fallback_path = published_media_path( - FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[1] - ) def prepare_item(item: ElementItem) -> None: + audio_base_path = local_audio_path(source_audio) + video_base_path = local_video_path(source_video) item.audios = [ { "url": source_audio, - "path": audio_default_path, + "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", "published_url": _published_url( "https://mirror.example", - f"audio/{audio_default_path}", + f"audio/{audio_base_path}-vbr7-3b2b0f13.mp3", ), "checksum": "audio-default", "status": "downloaded", @@ -108,47 +87,32 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: { "url": _published_url( "https://mirror.example", - f"audio/{audio_default_path}", + f"audio/{audio_base_path}-vbr7-3b2b0f13.mp3", ), - "path": audio_default_path, - "type": "audio/mpeg", + "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", + "type": "audio/mp3", "medium": "audio", "isDefault": "true", "fileSize": "4567", - "bitrate": "37209", + "bitrate": "96000", "duration": "61.2", - "samplingrate": "48000", - "channels": "1", + "samplingrate": "44100", + "channels": "2", }, { "url": _published_url( "https://mirror.example", - f"audio/{audio_m4a_path}", + f"audio/{audio_base_path}-vbr3-4a2a58d5.aac", ), - "path": audio_m4a_path, - "type": "audio/mp4", + "path": f"{audio_base_path}-vbr3-4a2a58d5.aac", + "type": "audio/aac", "medium": "audio", "isDefault": "false", "fileSize": "3456", - "bitrate": "20746", + "bitrate": "88000", "duration": "61.2", "samplingrate": "48000", - "channels": "1", - }, - { - "url": _published_url( - "https://mirror.example", - f"audio/{audio_webm_path}", - ), - "path": audio_webm_path, - "type": "audio/webm", - "medium": "audio", - "isDefault": "false", - "fileSize": "2345", - "bitrate": "48000", - "duration": "61.2", - "samplingrate": "48000", - "channels": "1", + "channels": "2", }, { "url": _published_url( @@ -171,10 +135,10 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: item.videos = [ { "url": source_video, - "path": video_main_path, + "path": f"{video_base_path}-720-457f0928.mp4", "published_url": _published_url( "https://mirror.example", - f"video/{video_main_path}", + f"video/{video_base_path}-720-457f0928.mp4", ), "checksum": "video-default", "status": "downloaded", @@ -182,9 +146,9 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: { "url": _published_url( "https://mirror.example", - f"video/{video_main_path}", + f"video/{video_base_path}-720-457f0928.mp4", ), - "path": video_main_path, + "path": f"{video_base_path}-720-457f0928.mp4", "type": "video/mp4", "medium": "video", "isDefault": "true", @@ -195,22 +159,6 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: "height": "720", "framerate": "30/1", }, - { - "url": _published_url( - "https://mirror.example", - f"video/{video_fallback_path}", - ), - "path": video_fallback_path, - "type": "video/webm", - "medium": "video", - "isDefault": "false", - "fileSize": "6789", - "bitrate": "64000", - "duration": "60.0", - "width": "1280", - "height": "720", - "framerate": "25/1", - }, { "url": _published_url( "https://mirror.example", @@ -309,9 +257,12 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: enclosure = root.find("./channel/item/enclosure") assert enclosure is not None assert enclosure.attrib == { - "url": (f"https://mirror.example/feeds/demo/audio/" f"{audio_default_path}"), + "url": ( + f"https://mirror.example/feeds/demo/audio/" + f"{local_audio_path(source_audio)}-vbr7-3b2b0f13.mp3" + ), "length": "4567", - "type": "audio/mpeg", + "type": "audio/mp3", } assert len(enclosure) == 0 @@ -325,39 +276,32 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: assert [variant.attrib for variant in audio_variants] == [ { "url": ( - f"https://mirror.example/feeds/demo/audio/" f"{audio_default_path}" + f"https://mirror.example/feeds/demo/audio/" + f"{local_audio_path(source_audio)}-vbr7-3b2b0f13.mp3" ), - "type": "audio/mpeg", + "type": "audio/mp3", "medium": "audio", "isDefault": "true", - "bitrate": "37209", - "samplingrate": "48000", - "channels": "1", + "bitrate": "96000", + "samplingrate": "44100", + "channels": "2", "duration": "61.2", "fileSize": "4567", }, { - "url": (f"https://mirror.example/feeds/demo/audio/" f"{audio_m4a_path}"), - "type": "audio/mp4", + "url": ( + f"https://mirror.example/feeds/demo/audio/" + f"{local_audio_path(source_audio)}-vbr3-4a2a58d5.aac" + ), + "type": "audio/aac", "medium": "audio", "isDefault": "false", - "bitrate": "20746", + "bitrate": "88000", "samplingrate": "48000", - "channels": "1", + "channels": "2", "duration": "61.2", "fileSize": "3456", }, - { - "url": (f"https://mirror.example/feeds/demo/audio/" f"{audio_webm_path}"), - "type": "audio/webm", - "medium": "audio", - "isDefault": "false", - "bitrate": "48000", - "samplingrate": "48000", - "channels": "1", - "duration": "61.2", - "fileSize": "2345", - }, { "url": ( f"https://mirror.example/feeds/demo/audio/" @@ -377,7 +321,10 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: video_variants = video_group.findall("media:content", namespaces=nsmap) assert [variant.attrib for variant in video_variants] == [ { - "url": (f"https://mirror.example/feeds/demo/video/" f"{video_main_path}"), + "url": ( + f"https://mirror.example/feeds/demo/video/" + f"{local_video_path(source_video)}-720-457f0928.mp4" + ), "type": "video/mp4", "medium": "video", "isDefault": "true", @@ -390,22 +337,6 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: "lang": "en", "fileSize": "9876", }, - { - "url": ( - f"https://mirror.example/feeds/demo/video/" f"{video_fallback_path}" - ), - "type": "video/webm", - "medium": "video", - "isDefault": "false", - "expression": "full", - "bitrate": "64000", - "framerate": "25/1", - "duration": "60.0", - "height": "720", - "width": "1280", - "lang": "en", - "fileSize": "6789", - }, { "url": ( f"https://mirror.example/feeds/demo/video/" @@ -437,60 +368,10 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: assert "<" not in itunes_summary assert ">" not in itunes_summary - -def test_item_body_uses_description_only_when_content_is_also_present() -> None: - xml, root = _serialize_feed( - feed_url="https://mirror.example", - feed_text=""" - - - Demo Feed - https://source.example/feed - Demo description - - Description Only - https://source.example/description-only - Description body

]]>
- entry-description-only - Tue, 31 Mar 2026 10:31:50 +0000 -
- - Content Only - https://source.example/content-only - entry-content-only - Tue, 31 Mar 2026 10:31:50 +0000 - Content body]]> - - - Both Present - https://source.example/both-present - Summary body

]]>
- entry-both-present - Tue, 31 Mar 2026 10:31:50 +0000 - Full body]]> -
-
-
-""", - ) - - items = root.findall("./channel/item") - assert len(items) == 3 - - description_only, content_only, both_present = items - - assert description_only.findtext("description") in (None, "") - assert description_only.findtext("content:encoded", namespaces=nsmap) == ( - "

Description body

" - ) - - assert content_only.findtext("description") in (None, "") - assert content_only.findtext("content:encoded", namespaces=nsmap) == ( - "
Content body
" - ) - - assert both_present.findtext("description") == "

Summary body

" - assert both_present.findtext("content:encoded", namespaces=nsmap) == ( - "
Full body
" + assert "contenteditable=" not in xml + assert "mode=" not in xml + assert "querystring=" not in xml + assert ( + f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}" + in xml ) diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py index ff43b6a..66246f3 100644 --- a/tests/test_file_feeds.py +++ b/tests/test_file_feeds.py @@ -71,22 +71,16 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: "https://example.com/media/podcast.mp3", ) == ( "audio/" - + published_media_path( - FileType.AUDIO, - "https://example.com/media/podcast.mp3", - repub_settings.REPUBLISHER_AUDIO[0], - ) + f"{local_audio_path('https://example.com/media/podcast.mp3')}" + "-vbr7-3b2b0f13.mp3" ) assert spider.rewrite_file_url( FileType.VIDEO, "https://example.com/media/clip.mp4", ) == ( "video/" - + published_media_path( - FileType.VIDEO, - "https://example.com/media/clip.mp4", - repub_settings.REPUBLISHER_VIDEO[0], - ) + f"{local_video_path('https://example.com/media/clip.mp4')}" + "-720-457f0928.mp4" ) @@ -96,10 +90,10 @@ def test_published_media_path_changes_when_profile_args_change() -> None: base_profile = repub_settings.REPUBLISHER_VIDEO[0] assert published_media_path(FileType.AUDIO, source_url, audio_profile) == ( - f"{local_audio_path(source_url)}-mp3_vbr7_voice-1cc131cf.mp3" + f"{local_audio_path(source_url)}-vbr7-3b2b0f13.mp3" ) assert published_media_path(FileType.VIDEO, source_url, base_profile) == ( - f"{local_video_path(source_url)}-main-4fb03ba0.mp4" + f"{local_video_path(source_url)}-720-457f0928.mp4" ) changed_audio_profile = {**audio_profile, "max_bitrate": 128000} diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 523f9bd..e82672b 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -8,7 +8,6 @@ from scrapy.crawler import Crawler from scrapy.http import Request, Response from repub import media -from repub import settings as repub_settings from repub.config import ( FeedConfig, RepublisherConfig, @@ -17,12 +16,7 @@ from repub.config import ( ) from repub.items import ElementItem from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline -from repub.utils import ( - FileType, - local_audio_path, - local_video_path, - published_media_path, -) +from repub.utils import local_audio_path, local_video_path def build_test_crawler(tmp_path: Path) -> SimpleNamespace: @@ -315,103 +309,6 @@ def test_video_transcode_params_scales_to_max_height_for_multipass() -> None: } -def test_audio_transcode_params_accepts_m4a_format_family() -> None: - params = media.audio_transcode_params( - { - "format": { - "bit_rate": "20000", - "format_name": "mov,mp4,m4a,3gp,3g2,mj2", - }, - "streams": [ - { - "codec_type": "audio", - "codec_name": "aac", - "bit_rate": "20000", - "duration_ts": "1", - } - ], - }, - cast( - media.AudioSettings, - { - "name": "m4a", - "format": "m4a", - "max_bitrate": 64000, - "mimetype": "audio/mp4", - "extension": "m4a", - "ffmpeg_audio_params": { - "acodec": "libfdk_aac", - "vbr": "2", - }, - }, - ), - ) - - assert params is None - - -def test_audio_meta_handles_webm_without_duration_ts() -> None: - assert media.audio_meta( - { - "format": { - "duration": "1.0", - "size": "100", - "bit_rate": "48000", - "format_name": "matroska,webm", - }, - "streams": [ - { - "codec_type": "audio", - "codec_name": "opus", - "sample_rate": "48000", - "channels": 1, - } - ], - } - ) == { - "duration": "1.0", - "fileSize": "100", - "bitrate": 48000, - "samplingrate": 48000, - "channels": 1, - } - - -def test_video_meta_handles_webm_without_duration_ts() -> None: - assert media.video_meta( - { - "format": { - "duration": "1.0", - "size": "200", - "bit_rate": "64000", - "format_name": "matroska,webm", - }, - "streams": [ - { - "codec_type": "video", - "codec_name": "vp9", - "width": 640, - "height": 360, - "avg_frame_rate": "25/1", - }, - { - "codec_type": "audio", - "codec_name": "opus", - "sample_rate": "48000", - "channels": 1, - }, - ], - } - ) == { - "duration": "1.0", - "fileSize": "200", - "width": 640, - "height": 360, - "bitrate": 64000, - "framerate": "25/1", - } - - def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants( monkeypatch, tmp_path: Path ) -> None: @@ -440,24 +337,13 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant output_path.write_bytes(settings["name"].encode("utf-8")) return str(output_path) - audio_default_path = published_media_path( - FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0] - ) - audio_m4a_path = published_media_path( - FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1] - ) - audio_webm_path = published_media_path( - FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2] - ) - def fake_probe_media(file_path: str): - file_name = Path(file_path).name - if file_path.endswith(audio_default_path) or file_name == "mp3_vbr7_voice.mp3": + if file_path.endswith(".mp3-vbr7-3b2b0f13.mp3"): return { "format": { "duration": "61.2", "size": "4567", - "bit_rate": "37209", + "bit_rate": "96000", "format_name": "mp3", "format_long_name": "MP3", }, @@ -465,69 +351,48 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant { "codec_type": "audio", "codec_name": "mp3", - "bit_rate": "37209", + "bit_rate": "96000", "duration_ts": "61200", - "sample_rate": "48000", - "channels": 1, + "sample_rate": "44100", + "channels": 2, } ], } - if file_path.endswith(audio_m4a_path) or file_name == "m4a_aac_vbr2_voice.m4a": + if file_path.endswith(".mp3"): return { "format": { "duration": "61.2", - "size": "3456", - "bit_rate": "20746", - "format_name": "mov,mp4,m4a,3gp,3g2,mj2", - "format_long_name": "AAC", + "size": "5678", + "bit_rate": "128000", + "format_name": "mp3", + "format_long_name": "MP3", }, "streams": [ { "codec_type": "audio", - "codec_name": "aac", - "bit_rate": "20746", + "codec_name": "mp3", + "bit_rate": "128000", "duration_ts": "61200", - "sample_rate": "48000", - "channels": 1, - } - ], - } - if ( - file_path.endswith(audio_webm_path) - or file_name == "webm_opus_voice_48k.webm" - ): - return { - "format": { - "duration": "61.2", - "size": "2345", - "bit_rate": "48000", - "format_name": "matroska,webm", - "format_long_name": "WebM", - }, - "streams": [ - { - "codec_type": "audio", - "codec_name": "opus", - "sample_rate": "48000", - "channels": 1, + "sample_rate": "44100", + "channels": 2, } ], } return { "format": { "duration": "61.2", - "size": "5678", - "bit_rate": "128000", - "format_name": "mp3", - "format_long_name": "MP3", + "size": "3456", + "bit_rate": "88000", + "format_name": "aac", + "format_long_name": "AAC", }, "streams": [ { "codec_type": "audio", - "codec_name": "mp3", - "bit_rate": "128000", + "codec_name": "aac", + "bit_rate": "88000", "duration_ts": "61200", - "sample_rate": "44100", + "sample_rate": "48000", "channels": 2, } ], @@ -558,48 +423,43 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant assert isinstance(result["checksum"], str) assert result == { "url": source_url, - "path": audio_default_path, + "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", "published_url": ( - f"https://mirror.example/feeds/nasa/audio/{audio_default_path}" + "https://mirror.example/feeds/nasa/audio/" + f"{audio_base_path}-vbr7-3b2b0f13.mp3" ), "checksum": result["checksum"], "status": "downloaded", "variants": [ { - "url": f"https://mirror.example/feeds/nasa/audio/{audio_default_path}", - "path": audio_default_path, - "type": "audio/mpeg", + "url": ( + "https://mirror.example/feeds/nasa/audio/" + f"{audio_base_path}-vbr7-3b2b0f13.mp3" + ), + "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3", + "type": "audio/mp3", "medium": "audio", "isDefault": "true", "fileSize": "4567", - "bitrate": 37209, + "bitrate": 96000, "duration": "61.2", - "samplingrate": 48000, - "channels": 1, + "samplingrate": 44100, + "channels": 2, }, { - "url": f"https://mirror.example/feeds/nasa/audio/{audio_m4a_path}", - "path": audio_m4a_path, - "type": "audio/mp4", + "url": ( + "https://mirror.example/feeds/nasa/audio/" + f"{audio_base_path}-vbr3-4a2a58d5.aac" + ), + "path": f"{audio_base_path}-vbr3-4a2a58d5.aac", + "type": "audio/aac", "medium": "audio", "isDefault": "false", "fileSize": "3456", - "bitrate": 20746, + "bitrate": 88000, "duration": "61.2", "samplingrate": 48000, - "channels": 1, - }, - { - "url": f"https://mirror.example/feeds/nasa/audio/{audio_webm_path}", - "path": audio_webm_path, - "type": "audio/webm", - "medium": "audio", - "isDefault": "false", - "fileSize": "2345", - "bitrate": 48000, - "duration": "61.2", - "samplingrate": 48000, - "channels": 1, + "channels": 2, }, { "url": f"https://mirror.example/feeds/nasa/audio/{audio_base_path}", @@ -617,9 +477,8 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant } assert persisted == [ (audio_base_path, "audio/mpeg"), - (audio_default_path, "audio/mpeg"), - (audio_m4a_path, "audio/mp4"), - (audio_webm_path, "audio/webm"), + (f"{audio_base_path}-vbr7-3b2b0f13.mp3", "audio/mp3"), + (f"{audio_base_path}-vbr3-4a2a58d5.aac", "audio/aac"), ] completed_item = pipeline.item_completed( @@ -659,70 +518,23 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant return str(output_path) monkeypatch.setattr(pipeline, "transcode", fake_transcode) - video_main_path = published_media_path( - FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[0] - ) - video_fallback_path = published_media_path( - FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[1] - ) - - def fake_probe_media(file_path: str): - file_name = Path(file_path).name - if file_path.endswith(video_main_path) or file_name == "main.mp4": - return { - "format": { - "duration": "60.0", - "size": "9876", - "bit_rate": "123456", - "format_name": "mp4", - "format_long_name": "MP4", - }, - "streams": [ - { - "codec_type": "video", - "codec_name": "h264", - "bit_rate": "123456", - "duration_ts": "60000", - "width": 1280, - "height": 720, - "avg_frame_rate": "30/1", - }, - { - "codec_type": "audio", - "codec_name": "aac", - "bit_rate": "96000", - "duration_ts": "60000", - }, - ], - } - if file_path.endswith(video_fallback_path) or file_name == "fallback.webm": - return { - "format": { - "duration": "60.0", - "size": "6789", - "bit_rate": "64000", - "format_name": "matroska,webm", - "format_long_name": "WebM", - }, - "streams": [ - { - "codec_type": "video", - "codec_name": "vp9", - "width": 1280, - "height": 720, - "avg_frame_rate": "25/1", - }, - { - "codec_type": "audio", - "codec_name": "opus", - }, - ], - } - return { + transcoded_suffix = "-720-457f0928.mp4" + monkeypatch.setattr( + media, + "probe_media", + lambda _: { "format": { "duration": "60.0", - "size": "12345", - "bit_rate": "456789", + "size": ( + "12345" + if _.endswith(".mp4") and not _.endswith(transcoded_suffix) + else "9876" + ), + "bit_rate": ( + "456789" + if _.endswith(".mp4") and not _.endswith(transcoded_suffix) + else "123456" + ), "format_name": "mp4", "format_long_name": "MP4", }, @@ -730,11 +542,27 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant { "codec_type": "video", "codec_name": "h264", - "bit_rate": "456789", + "bit_rate": ( + "456789" + if _.endswith(".mp4") and not _.endswith(transcoded_suffix) + else "123456" + ), "duration_ts": "60000", - "width": 640, - "height": 360, - "avg_frame_rate": "24/1", + "width": ( + 640 + if _.endswith(".mp4") and not _.endswith(transcoded_suffix) + else 1280 + ), + "height": ( + 360 + if _.endswith(".mp4") and not _.endswith(transcoded_suffix) + else 720 + ), + "avg_frame_rate": ( + "24/1" + if _.endswith(".mp4") and not _.endswith(transcoded_suffix) + else "30/1" + ), }, { "codec_type": "audio", @@ -743,9 +571,8 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant "duration_ts": "60000", }, ], - } - - monkeypatch.setattr(media, "probe_media", fake_probe_media) + }, + ) def fake_persist_file(path, buf, info, meta=None, headers=None): del info, meta @@ -769,14 +596,20 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant assert isinstance(result["checksum"], str) assert result == { "url": source_url, - "path": video_main_path, - "published_url": (f"https://mirror.example/feeds/nasa/video/{video_main_path}"), + "path": f"{video_base_path}-720-457f0928.mp4", + "published_url": ( + "https://mirror.example/feeds/nasa/video/" + f"{video_base_path}-720-457f0928.mp4" + ), "checksum": result["checksum"], "status": "downloaded", "variants": [ { - "url": f"https://mirror.example/feeds/nasa/video/{video_main_path}", - "path": video_main_path, + "url": ( + "https://mirror.example/feeds/nasa/video/" + f"{video_base_path}-720-457f0928.mp4" + ), + "path": f"{video_base_path}-720-457f0928.mp4", "type": "video/mp4", "medium": "video", "isDefault": "true", @@ -787,19 +620,6 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant "height": 720, "framerate": "30/1", }, - { - "url": f"https://mirror.example/feeds/nasa/video/{video_fallback_path}", - "path": video_fallback_path, - "type": "video/webm", - "medium": "video", - "isDefault": "false", - "fileSize": "6789", - "bitrate": 64000, - "duration": "60.0", - "width": 1280, - "height": 720, - "framerate": "25/1", - }, { "url": f"https://mirror.example/feeds/nasa/video/{video_base_path}", "path": video_base_path, @@ -817,8 +637,7 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant } assert persisted == [ (video_base_path, "video/mp4"), - (video_main_path, "video/mp4"), - (video_fallback_path, "video/webm"), + (f"{video_base_path}-720-457f0928.mp4", "video/mp4"), ] @@ -831,24 +650,13 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( source_url = "https://example.com/podcast.mp3" audio_base_path = local_audio_path(source_url) original_path = store_dir(pipeline) / audio_base_path - audio_default_path = published_media_path( - FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0] - ) - audio_m4a_path = published_media_path( - FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1] - ) - audio_webm_path = published_media_path( - FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2] - ) - canonical_path = store_dir(pipeline) / audio_default_path - m4a_path = store_dir(pipeline) / audio_m4a_path - webm_path = store_dir(pipeline) / audio_webm_path + canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7-3b2b0f13.mp3" + secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3-4a2a58d5.aac" original_path.parent.mkdir(parents=True, exist_ok=True) original_path.write_bytes(b"original") canonical_path.parent.mkdir(parents=True, exist_ok=True) canonical_path.write_bytes(b"default") - m4a_path.write_bytes(b"alt-aac") - webm_path.write_bytes(b"alt-webm") + secondary_path.write_bytes(b"alt") stat_paths: list[str] = [] original_stat_file = pipeline.store.stat_file item = ElementItem( @@ -875,38 +683,12 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( lambda file_path: { "format": { "duration": "61.2", - "size": ( - "4567" - if file_path.endswith(audio_default_path) - else ( - "3456" - if file_path.endswith(audio_m4a_path) - else "2345" if file_path.endswith(audio_webm_path) else "5678" - ) - ), + "size": ("4567" if file_path.endswith("vbr7-3b2b0f13.mp3") else "3456"), "bit_rate": ( - "37209" - if file_path.endswith(audio_default_path) - else ( - "20746" - if file_path.endswith(audio_m4a_path) - else ( - "48000" if file_path.endswith(audio_webm_path) else "128000" - ) - ) + "96000" if file_path.endswith("vbr7-3b2b0f13.mp3") else "88000" ), "format_name": ( - "mp3" - if file_path.endswith(audio_default_path) - else ( - "mov,mp4,m4a,3gp,3g2,mj2" - if file_path.endswith(audio_m4a_path) - else ( - "matroska,webm" - if file_path.endswith(audio_webm_path) - else "mp3" - ) - ) + "mp3" if file_path.endswith("vbr7-3b2b0f13.mp3") else "aac" ), "format_long_name": "Audio", }, @@ -914,36 +696,16 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( { "codec_type": "audio", "codec_name": ( - "mp3" - if file_path.endswith(audio_default_path) - else ( - "aac" - if file_path.endswith(audio_m4a_path) - else ( - "opus" if file_path.endswith(audio_webm_path) else "mp3" - ) - ) + "mp3" if file_path.endswith("vbr7-3b2b0f13.mp3") else "aac" ), "bit_rate": ( - "37209" - if file_path.endswith(audio_default_path) - else ( - "20746" - if file_path.endswith(audio_m4a_path) - else ( - None - if file_path.endswith(audio_webm_path) - else "128000" - ) - ) - ), - "duration_ts": ( - None if file_path.endswith(audio_webm_path) else "61200" + "96000" if file_path.endswith("vbr7-3b2b0f13.mp3") else "88000" ), + "duration_ts": "61200", "sample_rate": ( - "44100" if file_path == str(original_path) else "48000" + "44100" if file_path.endswith("vbr7-3b2b0f13.mp3") else "48000" ), - "channels": 1 if file_path != str(original_path) else 2, + "channels": 2, } ], }, @@ -955,13 +717,12 @@ def test_audio_pipeline_media_to_download_checks_canonical_path( item=item, ) assert result is not None - assert result["path"] == audio_default_path + assert result["path"] == f"{audio_base_path}-vbr7-3b2b0f13.mp3" assert result["status"] == "uptodate" assert [variant.get("path") for variant in result["variants"]] == [ - audio_default_path, - audio_m4a_path, - audio_webm_path, + f"{audio_base_path}-vbr7-3b2b0f13.mp3", + f"{audio_base_path}-vbr3-4a2a58d5.aac", audio_base_path, ] assert f"{audio_base_path}.mp3" not in stat_paths - assert stat_paths[0] == audio_default_path + assert stat_paths[0] == f"{audio_base_path}-vbr7-3b2b0f13.mp3" diff --git a/uv.lock b/uv.lock index 857e52d..50e2792 100644 --- a/uv.lock +++ b/uv.lock @@ -936,7 +936,7 @@ wheels = [ [[package]] name = "pygea" version = "0.1.0" -source = { git = "https://guardianproject.dev/anynews/pygea.git#c58bac3abddc019c74a1de45835688461f39f2d0" } +source = { git = "https://guardianproject.dev/anynews/pygea.git#bff04afbf612612108d9651f355d066c4b5f6a64" } dependencies = [ { name = "beautifulsoup4" }, { name = "feedgen" },