Fix ffprobe handling for WebM and format families

This commit is contained in:
Abel Luck 2026-04-01 17:13:19 +02:00
parent 0504013c5a
commit 05ac6ce20d
4 changed files with 500 additions and 169 deletions

View file

@ -8,6 +8,7 @@ from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from repub import media
from repub import settings as repub_settings
from repub.config import (
FeedConfig,
RepublisherConfig,
@ -16,7 +17,12 @@ from repub.config import (
)
from repub.items import ElementItem
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
from repub.utils import local_audio_path, local_video_path
from repub.utils import (
FileType,
local_audio_path,
local_video_path,
published_media_path,
)
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
@ -309,6 +315,103 @@ def test_video_transcode_params_scales_to_max_height_for_multipass() -> None:
}
def test_audio_transcode_params_accepts_m4a_format_family() -> None:
    """Check ``audio_transcode_params`` against an m4a "format family" probe.

    The probe reports ``format_name`` as the comma-separated family string
    ``mov,mp4,m4a,3gp,3g2,mj2`` rather than a bare ``m4a``, while the target
    settings ask for the ``m4a`` format. The call is expected to return
    ``None`` for this combination.
    """
    aac_stream = {
        "codec_type": "audio",
        "codec_name": "aac",
        "bit_rate": "20000",
        "duration_ts": "1",
    }
    probe_result = {
        "format": {
            "bit_rate": "20000",
            # Comma-separated family string; the target format "m4a" is one entry.
            "format_name": "mov,mp4,m4a,3gp,3g2,mj2",
        },
        "streams": [aac_stream],
    }
    m4a_settings = cast(
        media.AudioSettings,
        {
            "name": "m4a",
            "format": "m4a",
            # Well above the 20000 bit rate reported by the probe.
            "max_bitrate": 64000,
            "mimetype": "audio/mp4",
            "extension": "m4a",
            "ffmpeg_audio_params": {
                "acodec": "libfdk_aac",
                "vbr": "2",
            },
        },
    )

    params = media.audio_transcode_params(probe_result, m4a_settings)

    # NOTE(review): None presumably means "no transcode needed" -- confirm
    # against repub.media.audio_transcode_params.
    assert params is None
def test_audio_meta_handles_webm_without_duration_ts() -> None:
    """Check ``audio_meta`` on a WebM/Opus probe with no stream-level timing.

    The single audio stream carries neither ``duration_ts`` nor ``bit_rate``;
    only the ``format`` section provides duration, size and bit rate. The
    expected metadata uses those format-level values, with the bit rate and
    sample rate as integers.
    """
    opus_stream = {
        "codec_type": "audio",
        "codec_name": "opus",
        "sample_rate": "48000",
        "channels": 1,
    }
    probe_result = {
        "format": {
            "duration": "1.0",
            "size": "100",
            "bit_rate": "48000",
            "format_name": "matroska,webm",
        },
        "streams": [opus_stream],
    }

    meta = media.audio_meta(probe_result)

    assert meta == {
        "duration": "1.0",
        "fileSize": "100",
        "bitrate": 48000,
        "samplingrate": 48000,
        "channels": 1,
    }
def test_video_meta_handles_webm_without_duration_ts() -> None:
    """Check ``video_meta`` on a WebM (VP9 + Opus) probe with no stream timing.

    Neither stream carries ``duration_ts`` or ``bit_rate``. The expected
    metadata takes duration, size and bit rate from the ``format`` section and
    the dimensions and frame rate from the video stream.
    """
    vp9_stream = {
        "codec_type": "video",
        "codec_name": "vp9",
        "width": 640,
        "height": 360,
        "avg_frame_rate": "25/1",
    }
    opus_stream = {
        "codec_type": "audio",
        "codec_name": "opus",
        "sample_rate": "48000",
        "channels": 1,
    }
    probe_result = {
        "format": {
            "duration": "1.0",
            "size": "200",
            "bit_rate": "64000",
            "format_name": "matroska,webm",
        },
        "streams": [vp9_stream, opus_stream],
    }

    meta = media.video_meta(probe_result)

    assert meta == {
        "duration": "1.0",
        "fileSize": "200",
        "width": 640,
        "height": 360,
        "bitrate": 64000,
        "framerate": "25/1",
    }
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
monkeypatch, tmp_path: Path
) -> None:
@ -337,13 +440,24 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
output_path.write_bytes(settings["name"].encode("utf-8"))
return str(output_path)
audio_default_path = published_media_path(
FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0]
)
audio_m4a_path = published_media_path(
FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1]
)
audio_webm_path = published_media_path(
FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2]
)
def fake_probe_media(file_path: str):
if file_path.endswith(".mp3-vbr7-3b2b0f13.mp3"):
file_name = Path(file_path).name
if file_path.endswith(audio_default_path) or file_name == "mp3_vbr7_voice.mp3":
return {
"format": {
"duration": "61.2",
"size": "4567",
"bit_rate": "96000",
"bit_rate": "37209",
"format_name": "mp3",
"format_long_name": "MP3",
},
@ -351,48 +465,69 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
{
"codec_type": "audio",
"codec_name": "mp3",
"bit_rate": "96000",
"bit_rate": "37209",
"duration_ts": "61200",
"sample_rate": "44100",
"channels": 2,
"sample_rate": "48000",
"channels": 1,
}
],
}
if file_path.endswith(".mp3"):
if file_path.endswith(audio_m4a_path) or file_name == "m4a_aac_vbr2_voice.m4a":
return {
"format": {
"duration": "61.2",
"size": "5678",
"bit_rate": "128000",
"format_name": "mp3",
"format_long_name": "MP3",
"size": "3456",
"bit_rate": "20746",
"format_name": "mov,mp4,m4a,3gp,3g2,mj2",
"format_long_name": "AAC",
},
"streams": [
{
"codec_type": "audio",
"codec_name": "mp3",
"bit_rate": "128000",
"codec_name": "aac",
"bit_rate": "20746",
"duration_ts": "61200",
"sample_rate": "44100",
"channels": 2,
"sample_rate": "48000",
"channels": 1,
}
],
}
if (
file_path.endswith(audio_webm_path)
or file_name == "webm_opus_voice_48k.webm"
):
return {
"format": {
"duration": "61.2",
"size": "2345",
"bit_rate": "48000",
"format_name": "matroska,webm",
"format_long_name": "WebM",
},
"streams": [
{
"codec_type": "audio",
"codec_name": "opus",
"sample_rate": "48000",
"channels": 1,
}
],
}
return {
"format": {
"duration": "61.2",
"size": "3456",
"bit_rate": "88000",
"format_name": "aac",
"format_long_name": "AAC",
"size": "5678",
"bit_rate": "128000",
"format_name": "mp3",
"format_long_name": "MP3",
},
"streams": [
{
"codec_type": "audio",
"codec_name": "aac",
"bit_rate": "88000",
"codec_name": "mp3",
"bit_rate": "128000",
"duration_ts": "61200",
"sample_rate": "48000",
"sample_rate": "44100",
"channels": 2,
}
],
@ -423,43 +558,48 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert isinstance(result["checksum"], str)
assert result == {
"url": source_url,
"path": f"{audio_base_path}-vbr7-3b2b0f13.mp3",
"path": audio_default_path,
"published_url": (
"https://mirror.example/feeds/nasa/audio/"
f"{audio_base_path}-vbr7-3b2b0f13.mp3"
f"https://mirror.example/feeds/nasa/audio/{audio_default_path}"
),
"checksum": result["checksum"],
"status": "downloaded",
"variants": [
{
"url": (
"https://mirror.example/feeds/nasa/audio/"
f"{audio_base_path}-vbr7-3b2b0f13.mp3"
),
"path": f"{audio_base_path}-vbr7-3b2b0f13.mp3",
"type": "audio/mp3",
"url": f"https://mirror.example/feeds/nasa/audio/{audio_default_path}",
"path": audio_default_path,
"type": "audio/mpeg",
"medium": "audio",
"isDefault": "true",
"fileSize": "4567",
"bitrate": 96000,
"bitrate": 37209,
"duration": "61.2",
"samplingrate": 44100,
"channels": 2,
"samplingrate": 48000,
"channels": 1,
},
{
"url": (
"https://mirror.example/feeds/nasa/audio/"
f"{audio_base_path}-vbr3-4a2a58d5.aac"
),
"path": f"{audio_base_path}-vbr3-4a2a58d5.aac",
"type": "audio/aac",
"url": f"https://mirror.example/feeds/nasa/audio/{audio_m4a_path}",
"path": audio_m4a_path,
"type": "audio/mp4",
"medium": "audio",
"isDefault": "false",
"fileSize": "3456",
"bitrate": 88000,
"bitrate": 20746,
"duration": "61.2",
"samplingrate": 48000,
"channels": 2,
"channels": 1,
},
{
"url": f"https://mirror.example/feeds/nasa/audio/{audio_webm_path}",
"path": audio_webm_path,
"type": "audio/webm",
"medium": "audio",
"isDefault": "false",
"fileSize": "2345",
"bitrate": 48000,
"duration": "61.2",
"samplingrate": 48000,
"channels": 1,
},
{
"url": f"https://mirror.example/feeds/nasa/audio/{audio_base_path}",
@ -477,8 +617,9 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
}
assert persisted == [
(audio_base_path, "audio/mpeg"),
(f"{audio_base_path}-vbr7-3b2b0f13.mp3", "audio/mp3"),
(f"{audio_base_path}-vbr3-4a2a58d5.aac", "audio/aac"),
(audio_default_path, "audio/mpeg"),
(audio_m4a_path, "audio/mp4"),
(audio_webm_path, "audio/webm"),
]
completed_item = pipeline.item_completed(
@ -518,23 +659,70 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
return str(output_path)
monkeypatch.setattr(pipeline, "transcode", fake_transcode)
transcoded_suffix = "-720-457f0928.mp4"
monkeypatch.setattr(
media,
"probe_media",
lambda _: {
video_main_path = published_media_path(
FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[0]
)
video_fallback_path = published_media_path(
FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[1]
)
def fake_probe_media(file_path: str):
file_name = Path(file_path).name
if file_path.endswith(video_main_path) or file_name == "main.mp4":
return {
"format": {
"duration": "60.0",
"size": "9876",
"bit_rate": "123456",
"format_name": "mp4",
"format_long_name": "MP4",
},
"streams": [
{
"codec_type": "video",
"codec_name": "h264",
"bit_rate": "123456",
"duration_ts": "60000",
"width": 1280,
"height": 720,
"avg_frame_rate": "30/1",
},
{
"codec_type": "audio",
"codec_name": "aac",
"bit_rate": "96000",
"duration_ts": "60000",
},
],
}
if file_path.endswith(video_fallback_path) or file_name == "fallback.webm":
return {
"format": {
"duration": "60.0",
"size": "6789",
"bit_rate": "64000",
"format_name": "matroska,webm",
"format_long_name": "WebM",
},
"streams": [
{
"codec_type": "video",
"codec_name": "vp9",
"width": 1280,
"height": 720,
"avg_frame_rate": "25/1",
},
{
"codec_type": "audio",
"codec_name": "opus",
},
],
}
return {
"format": {
"duration": "60.0",
"size": (
"12345"
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else "9876"
),
"bit_rate": (
"456789"
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else "123456"
),
"size": "12345",
"bit_rate": "456789",
"format_name": "mp4",
"format_long_name": "MP4",
},
@ -542,27 +730,11 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
{
"codec_type": "video",
"codec_name": "h264",
"bit_rate": (
"456789"
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else "123456"
),
"bit_rate": "456789",
"duration_ts": "60000",
"width": (
640
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else 1280
),
"height": (
360
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else 720
),
"avg_frame_rate": (
"24/1"
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else "30/1"
),
"width": 640,
"height": 360,
"avg_frame_rate": "24/1",
},
{
"codec_type": "audio",
@ -571,8 +743,9 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
"duration_ts": "60000",
},
],
},
)
}
monkeypatch.setattr(media, "probe_media", fake_probe_media)
def fake_persist_file(path, buf, info, meta=None, headers=None):
del info, meta
@ -596,20 +769,14 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert isinstance(result["checksum"], str)
assert result == {
"url": source_url,
"path": f"{video_base_path}-720-457f0928.mp4",
"published_url": (
"https://mirror.example/feeds/nasa/video/"
f"{video_base_path}-720-457f0928.mp4"
),
"path": video_main_path,
"published_url": (f"https://mirror.example/feeds/nasa/video/{video_main_path}"),
"checksum": result["checksum"],
"status": "downloaded",
"variants": [
{
"url": (
"https://mirror.example/feeds/nasa/video/"
f"{video_base_path}-720-457f0928.mp4"
),
"path": f"{video_base_path}-720-457f0928.mp4",
"url": f"https://mirror.example/feeds/nasa/video/{video_main_path}",
"path": video_main_path,
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
@ -620,6 +787,19 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
"height": 720,
"framerate": "30/1",
},
{
"url": f"https://mirror.example/feeds/nasa/video/{video_fallback_path}",
"path": video_fallback_path,
"type": "video/webm",
"medium": "video",
"isDefault": "false",
"fileSize": "6789",
"bitrate": 64000,
"duration": "60.0",
"width": 1280,
"height": 720,
"framerate": "25/1",
},
{
"url": f"https://mirror.example/feeds/nasa/video/{video_base_path}",
"path": video_base_path,
@ -637,7 +817,8 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
}
assert persisted == [
(video_base_path, "video/mp4"),
(f"{video_base_path}-720-457f0928.mp4", "video/mp4"),
(video_main_path, "video/mp4"),
(video_fallback_path, "video/webm"),
]
@ -650,13 +831,24 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
source_url = "https://example.com/podcast.mp3"
audio_base_path = local_audio_path(source_url)
original_path = store_dir(pipeline) / audio_base_path
canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7-3b2b0f13.mp3"
secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3-4a2a58d5.aac"
audio_default_path = published_media_path(
FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0]
)
audio_m4a_path = published_media_path(
FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1]
)
audio_webm_path = published_media_path(
FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2]
)
canonical_path = store_dir(pipeline) / audio_default_path
m4a_path = store_dir(pipeline) / audio_m4a_path
webm_path = store_dir(pipeline) / audio_webm_path
original_path.parent.mkdir(parents=True, exist_ok=True)
original_path.write_bytes(b"original")
canonical_path.parent.mkdir(parents=True, exist_ok=True)
canonical_path.write_bytes(b"default")
secondary_path.write_bytes(b"alt")
m4a_path.write_bytes(b"alt-aac")
webm_path.write_bytes(b"alt-webm")
stat_paths: list[str] = []
original_stat_file = pipeline.store.stat_file
item = ElementItem(
@ -683,12 +875,38 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
lambda file_path: {
"format": {
"duration": "61.2",
"size": ("4567" if file_path.endswith("vbr7-3b2b0f13.mp3") else "3456"),
"size": (
"4567"
if file_path.endswith(audio_default_path)
else (
"3456"
if file_path.endswith(audio_m4a_path)
else "2345" if file_path.endswith(audio_webm_path) else "5678"
)
),
"bit_rate": (
"96000" if file_path.endswith("vbr7-3b2b0f13.mp3") else "88000"
"37209"
if file_path.endswith(audio_default_path)
else (
"20746"
if file_path.endswith(audio_m4a_path)
else (
"48000" if file_path.endswith(audio_webm_path) else "128000"
)
)
),
"format_name": (
"mp3" if file_path.endswith("vbr7-3b2b0f13.mp3") else "aac"
"mp3"
if file_path.endswith(audio_default_path)
else (
"mov,mp4,m4a,3gp,3g2,mj2"
if file_path.endswith(audio_m4a_path)
else (
"matroska,webm"
if file_path.endswith(audio_webm_path)
else "mp3"
)
)
),
"format_long_name": "Audio",
},
@ -696,16 +914,36 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
{
"codec_type": "audio",
"codec_name": (
"mp3" if file_path.endswith("vbr7-3b2b0f13.mp3") else "aac"
"mp3"
if file_path.endswith(audio_default_path)
else (
"aac"
if file_path.endswith(audio_m4a_path)
else (
"opus" if file_path.endswith(audio_webm_path) else "mp3"
)
)
),
"bit_rate": (
"96000" if file_path.endswith("vbr7-3b2b0f13.mp3") else "88000"
"37209"
if file_path.endswith(audio_default_path)
else (
"20746"
if file_path.endswith(audio_m4a_path)
else (
None
if file_path.endswith(audio_webm_path)
else "128000"
)
)
),
"duration_ts": (
None if file_path.endswith(audio_webm_path) else "61200"
),
"duration_ts": "61200",
"sample_rate": (
"44100" if file_path.endswith("vbr7-3b2b0f13.mp3") else "48000"
"44100" if file_path == str(original_path) else "48000"
),
"channels": 2,
"channels": 1 if file_path != str(original_path) else 2,
}
],
},
@ -717,12 +955,13 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
item=item,
)
assert result is not None
assert result["path"] == f"{audio_base_path}-vbr7-3b2b0f13.mp3"
assert result["path"] == audio_default_path
assert result["status"] == "uptodate"
assert [variant.get("path") for variant in result["variants"]] == [
f"{audio_base_path}-vbr7-3b2b0f13.mp3",
f"{audio_base_path}-vbr3-4a2a58d5.aac",
audio_default_path,
audio_m4a_path,
audio_webm_path,
audio_base_path,
]
assert f"{audio_base_path}.mp3" not in stat_paths
assert stat_paths[0] == f"{audio_base_path}-vbr7-3b2b0f13.mp3"
assert stat_paths[0] == audio_default_path