From 0504013c5a9cf63bb797664dce166e4c361cfc5d Mon Sep 17 00:00:00 2001
From: Abel Luck
Date: Wed, 1 Apr 2026 16:58:06 +0200
Subject: [PATCH 1/4] update audio and video transcoding profiles for
compatibility
---
repub/settings.py | 114 ++++++++++++++++++++++++++--------------------
1 file changed, 65 insertions(+), 49 deletions(-)
diff --git a/repub/settings.py b/repub/settings.py
index d39b635..252c974 100644
--- a/repub/settings.py
+++ b/repub/settings.py
@@ -102,79 +102,95 @@ MEDIA_ALLOW_REDIRECTS = True
REPUBLISHER_AUDIO = [
{
- "name": "vbr7",
+ "name": "mp3_vbr7_voice",
"format": "mp3",
- "max_bitrate": 96000,
- "mimetype": "audio/mp3",
+ "max_bitrate": 64000,
+ "mimetype": "audio/mpeg",
"extension": "mp3",
"ffmpeg_audio_params": {
"acodec": "libmp3lame",
- # https://trac.ffmpeg.org/wiki/Encode/MP3#VBREncoding
"qscale:a": "7",
+ "ac": "1",
+ "ar": "48000",
},
},
{
- "name": "vbr3",
- "format": "aac",
- "max_bitrate": 96000,
- "mimetype": "audio/aac",
- "extension": "aac",
+ "name": "m4a_aac_vbr2_voice",
+ "format": "m4a",
+ "max_bitrate": 64000,
+ "mimetype": "audio/mp4",
+ "extension": "m4a",
"ffmpeg_audio_params": {
"acodec": "libfdk_aac",
- # https://trac.ffmpeg.org/wiki/Encode/MP3#VBREncoding
- "vbr": "3",
+ "vbr": "2",
+ "ac": "1",
+ "ar": "48000",
+ },
+ },
+ {
+ "name": "webm_opus_voice_48k",
+ "format": "webm",
+ "max_bitrate": 48000,
+ "mimetype": "audio/webm",
+ "extension": "webm",
+ "ffmpeg_audio_params": {
+ "acodec": "libopus",
+ "b:a": "48k",
+ "ac": "1",
+ "ar": "48000",
+ "application": "voip",
},
},
]
REPUBLISHER_VIDEO = [
+ # broadly compatible: H.264 (main profile) + AAC in MP4
{
- "name": "720",
+ "name": "main",
"container": "mp4",
"vcodec": "h264",
- "acodec": "mp3",
+ "acodec": "aac",
"audio_max_bitrate": 96000,
"ffmpeg_audio_params": {
- "acodec": "libmp3lame",
- # https://trac.ffmpeg.org/wiki/Encode/MP3#VBREncoding
- "qscale:a": "7",
+ "acodec": "aac",
+ "b:a": "96k",
+ "ac": "2",
+ "ar": "48000",
+ },
+ "ffmpeg_video_params": {
+ "vcodec": "libx264",
+ "pix_fmt": "yuv420p",
+ "profile:v": "main",
+ "level": "4.0",
+ "preset": "medium",
+ "crf": "22",
+ "movflags": "+faststart",
},
- "ffmpeg_video_params": {"vcodec": "h264", "strict": "-2"},
"max_height": 720,
"mimetype": "video/mp4",
"extension": "mp4",
},
- # {
- # "passes": [
- # {
- # "c:v": "libvpx-vp9",
- # "b:v": "0",
- # "crf": "30",
- # "pass": "1",
- # "deadline": "good",
- # "row-mt": "1",
- # "f": "null",
- # },
- # {
- # "c:v": "libvpx-vp9",
- # "b:v": "0",
- # "crf": "30",
- # "pass": "2",
- # "deadline": "good",
- # "row-mt": "1",
- # "c:a": "libopus",
- # "b:a": "96k",
- # "ac": "2",
- # },
- # ],
- # "name": "720",
- # "container": "webm",
- # "vcodec": "libvpx-vp9",
- # "acodec": "opus",
- # "audio_max_bitrate": 96000,
- # "max_height": 720,
- # "mimetype": "video/webm",
- # "extension": "webm",
- # },
+ # Linux fallback without patent encumbrance
+ {
+ "name": "fallback",
+ "container": "webm",
+ "vcodec": "vp9",
+ "acodec": "opus",
+ "audio_max_bitrate": 96000,
+ "ffmpeg_audio_params": {
+ "acodec": "libopus",
+ "b:a": "96k",
+ "ac": "2",
+ "ar": "48000",
+ },
+ "ffmpeg_video_params": {
+ "vcodec": "libvpx-vp9",
+ "crf": "33",
+ "b:v": "0",
+ },
+ "max_height": 720,
+ "mimetype": "video/webm",
+ "extension": "webm",
+ },
]
REPUBLISHER_FFMPEG_ENCODERS = ["libmp3lame", "libfdk_aac", "libvpx-vp9", "libopus"]
From 05ac6ce20d9972a05769bb71a3fdb591ffb780a0 Mon Sep 17 00:00:00 2001
From: Abel Luck
Date: Wed, 1 Apr 2026 17:13:19 +0200
Subject: [PATCH 2/4] Fix ffprobe handling for WebM and format families
---
repub/media.py | 45 ++--
tests/test_feed_validation.py | 153 ++++++++----
tests/test_file_feeds.py | 18 +-
tests/test_pipelines.py | 453 ++++++++++++++++++++++++++--------
4 files changed, 500 insertions(+), 169 deletions(-)
diff --git a/repub/media.py b/repub/media.py
index fe8074b..aec3a4d 100644
--- a/repub/media.py
+++ b/repub/media.py
@@ -77,7 +77,7 @@ def probe_media(file_path) -> Dict[str, Any]:
def bitrate(info) -> float:
try:
return int(info["format"]["bit_rate"])
- except KeyError | ValueError:
+ except (KeyError, ValueError):
logger.error("extracting bitrate from ffprobe failed")
return math.inf
@@ -85,16 +85,34 @@ def bitrate(info) -> float:
def format_name(info) -> Optional[str]:
try:
return info["format"]["format_name"]
- except KeyError | ValueError:
+ except (KeyError, ValueError):
logger.error("extracting format from ffprobe failed")
return None
+def _stream_duration_sort_key(stream: Dict[str, Any]) -> tuple[int, float]:
+ duration_ts = _int_value(stream.get("duration_ts"))
+ if duration_ts is not None:
+ return 1, float(duration_ts)
+ try:
+ duration = float(str(stream.get("duration", "")))
+ except (TypeError, ValueError):
+ duration = 0.0
+ return 0, duration
+
+
+def _matches_format(probe: Dict[str, Any], expected: str) -> bool:
+ current = format_name(probe)
+ if current is None:
+ return False
+ return expected in current.split(",")
+
+
def primary_video_stream(probe):
video_streams = [
stream for stream in probe["streams"] if stream["codec_type"] == "video"
]
- video_streams = sorted(video_streams, key=lambda x: x["duration_ts"], reverse=True)
+ video_streams = sorted(video_streams, key=_stream_duration_sort_key, reverse=True)
if not video_streams:
return None
if len(video_streams) > 1:
@@ -108,7 +126,7 @@ def primary_audio_stream(probe):
audio_streams = [
stream for stream in probe["streams"] if stream["codec_type"] == "audio"
]
- audio_streams = sorted(audio_streams, key=lambda x: x["duration_ts"], reverse=True)
+ audio_streams = sorted(audio_streams, key=_stream_duration_sort_key, reverse=True)
if not audio_streams:
return None
if len(audio_streams) > 1:
@@ -126,7 +144,7 @@ def get_resolution(probe) -> Tuple[Optional[float], Optional[float]]:
width = int(video_stream["width"])
height = int(video_stream["height"])
return width, height
- except KeyError | ValueError:
+ except (KeyError, ValueError):
logger.error("extracting resolution from ffprobe failed")
return None, None
@@ -137,7 +155,7 @@ def get_vcodec_name(probe) -> Optional[str]:
if not video_stream:
return None
return video_stream["codec_name"]
- except KeyError | ValueError:
+ except (KeyError, ValueError):
logger.error("extracting video codec_name from ffprobe failed")
return None
@@ -147,8 +165,11 @@ def get_acodec_info(probe) -> Tuple[Optional[str], Optional[int]]:
audio_stream = primary_audio_stream(probe)
if not audio_stream:
return None, None
- return audio_stream["codec_name"], int(audio_stream["bit_rate"])
- except KeyError | ValueError:
+ audio_bitrate = _int_value(
+ audio_stream.get("bit_rate") or probe["format"].get("bit_rate")
+ )
+ return audio_stream["codec_name"], audio_bitrate
+ except (KeyError, ValueError):
logger.error("extracting audio codec_name from ffprobe failed")
return None, None
@@ -218,7 +239,7 @@ def audio_transcode_params(
is_br = True
else:
is_br = False
- if format_name(probe_result) == fmt:
+ if _matches_format(probe_result, fmt):
is_fmt = True
else:
is_fmt = False
@@ -289,11 +310,7 @@ def video_transcode_params(
# TODO: turn this into an exception and catch it for reporting
return None
- current_container_many = format_name(probe_result)
- is_container = False
- if current_container_many is not None:
- if target_container in current_container_many.split(","):
- is_container = True
+ is_container = _matches_format(probe_result, target_container)
is_vcodec = vcodec == target_vcodec
is_acodec = acodec == target_acodec
diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py
index 22589b4..290a90a 100644
--- a/tests/test_feed_validation.py
+++ b/tests/test_feed_validation.py
@@ -14,7 +14,13 @@ from repub.exporters import RssExporter
from repub.items import ElementItem
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
-from repub.utils import local_audio_path, local_image_path, local_video_path
+from repub.utils import (
+ FileType,
+ local_audio_path,
+ local_image_path,
+ local_video_path,
+ published_media_path,
+)
RSS_DATE_PATTERN = re.compile(
r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
@@ -69,17 +75,32 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
source_video = "https://source.example/media/video.mp4"
channel_image = "https://source.example/media/channel.png"
item_image = "https://source.example/media/cover.jpg"
+ audio_base_path = local_audio_path(source_audio)
+ audio_default_path = published_media_path(
+ FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0]
+ )
+ audio_m4a_path = published_media_path(
+ FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[1]
+ )
+ audio_webm_path = published_media_path(
+ FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[2]
+ )
+ video_base_path = local_video_path(source_video)
+ video_main_path = published_media_path(
+ FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[0]
+ )
+ video_fallback_path = published_media_path(
+ FileType.VIDEO, source_video, repub_settings.REPUBLISHER_VIDEO[1]
+ )
def prepare_item(item: ElementItem) -> None:
- audio_base_path = local_audio_path(source_audio)
- video_base_path = local_video_path(source_video)
item.audios = [
{
"url": source_audio,
- "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3",
+ "path": audio_default_path,
"published_url": _published_url(
"https://mirror.example",
- f"audio/{audio_base_path}-vbr7-3b2b0f13.mp3",
+ f"audio/{audio_default_path}",
),
"checksum": "audio-default",
"status": "downloaded",
@@ -87,32 +108,47 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
{
"url": _published_url(
"https://mirror.example",
- f"audio/{audio_base_path}-vbr7-3b2b0f13.mp3",
+ f"audio/{audio_default_path}",
),
- "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3",
- "type": "audio/mp3",
+ "path": audio_default_path,
+ "type": "audio/mpeg",
"medium": "audio",
"isDefault": "true",
"fileSize": "4567",
- "bitrate": "96000",
+ "bitrate": "37209",
"duration": "61.2",
- "samplingrate": "44100",
- "channels": "2",
+ "samplingrate": "48000",
+ "channels": "1",
},
{
"url": _published_url(
"https://mirror.example",
- f"audio/{audio_base_path}-vbr3-4a2a58d5.aac",
+ f"audio/{audio_m4a_path}",
),
- "path": f"{audio_base_path}-vbr3-4a2a58d5.aac",
- "type": "audio/aac",
+ "path": audio_m4a_path,
+ "type": "audio/mp4",
"medium": "audio",
"isDefault": "false",
"fileSize": "3456",
- "bitrate": "88000",
+ "bitrate": "20746",
"duration": "61.2",
"samplingrate": "48000",
- "channels": "2",
+ "channels": "1",
+ },
+ {
+ "url": _published_url(
+ "https://mirror.example",
+ f"audio/{audio_webm_path}",
+ ),
+ "path": audio_webm_path,
+ "type": "audio/webm",
+ "medium": "audio",
+ "isDefault": "false",
+ "fileSize": "2345",
+ "bitrate": "48000",
+ "duration": "61.2",
+ "samplingrate": "48000",
+ "channels": "1",
},
{
"url": _published_url(
@@ -135,10 +171,10 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
item.videos = [
{
"url": source_video,
- "path": f"{video_base_path}-720-457f0928.mp4",
+ "path": video_main_path,
"published_url": _published_url(
"https://mirror.example",
- f"video/{video_base_path}-720-457f0928.mp4",
+ f"video/{video_main_path}",
),
"checksum": "video-default",
"status": "downloaded",
@@ -146,9 +182,9 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
{
"url": _published_url(
"https://mirror.example",
- f"video/{video_base_path}-720-457f0928.mp4",
+ f"video/{video_main_path}",
),
- "path": f"{video_base_path}-720-457f0928.mp4",
+ "path": video_main_path,
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
@@ -159,6 +195,22 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
"height": "720",
"framerate": "30/1",
},
+ {
+ "url": _published_url(
+ "https://mirror.example",
+ f"video/{video_fallback_path}",
+ ),
+ "path": video_fallback_path,
+ "type": "video/webm",
+ "medium": "video",
+ "isDefault": "false",
+ "fileSize": "6789",
+ "bitrate": "64000",
+ "duration": "60.0",
+ "width": "1280",
+ "height": "720",
+ "framerate": "25/1",
+ },
{
"url": _published_url(
"https://mirror.example",
@@ -257,12 +309,9 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
enclosure = root.find("./channel/item/enclosure")
assert enclosure is not None
assert enclosure.attrib == {
- "url": (
- f"https://mirror.example/feeds/demo/audio/"
- f"{local_audio_path(source_audio)}-vbr7-3b2b0f13.mp3"
- ),
+ "url": (f"https://mirror.example/feeds/demo/audio/" f"{audio_default_path}"),
"length": "4567",
- "type": "audio/mp3",
+ "type": "audio/mpeg",
}
assert len(enclosure) == 0
@@ -276,32 +325,39 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
assert [variant.attrib for variant in audio_variants] == [
{
"url": (
- f"https://mirror.example/feeds/demo/audio/"
- f"{local_audio_path(source_audio)}-vbr7-3b2b0f13.mp3"
+ f"https://mirror.example/feeds/demo/audio/" f"{audio_default_path}"
),
- "type": "audio/mp3",
+ "type": "audio/mpeg",
"medium": "audio",
"isDefault": "true",
- "bitrate": "96000",
- "samplingrate": "44100",
- "channels": "2",
+ "bitrate": "37209",
+ "samplingrate": "48000",
+ "channels": "1",
"duration": "61.2",
"fileSize": "4567",
},
{
- "url": (
- f"https://mirror.example/feeds/demo/audio/"
- f"{local_audio_path(source_audio)}-vbr3-4a2a58d5.aac"
- ),
- "type": "audio/aac",
+ "url": (f"https://mirror.example/feeds/demo/audio/" f"{audio_m4a_path}"),
+ "type": "audio/mp4",
"medium": "audio",
"isDefault": "false",
- "bitrate": "88000",
+ "bitrate": "20746",
"samplingrate": "48000",
- "channels": "2",
+ "channels": "1",
"duration": "61.2",
"fileSize": "3456",
},
+ {
+ "url": (f"https://mirror.example/feeds/demo/audio/" f"{audio_webm_path}"),
+ "type": "audio/webm",
+ "medium": "audio",
+ "isDefault": "false",
+ "bitrate": "48000",
+ "samplingrate": "48000",
+ "channels": "1",
+ "duration": "61.2",
+ "fileSize": "2345",
+ },
{
"url": (
f"https://mirror.example/feeds/demo/audio/"
@@ -321,10 +377,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
video_variants = video_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in video_variants] == [
{
- "url": (
- f"https://mirror.example/feeds/demo/video/"
- f"{local_video_path(source_video)}-720-457f0928.mp4"
- ),
+ "url": (f"https://mirror.example/feeds/demo/video/" f"{video_main_path}"),
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
@@ -337,6 +390,22 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
"lang": "en",
"fileSize": "9876",
},
+ {
+ "url": (
+ f"https://mirror.example/feeds/demo/video/" f"{video_fallback_path}"
+ ),
+ "type": "video/webm",
+ "medium": "video",
+ "isDefault": "false",
+ "expression": "full",
+ "bitrate": "64000",
+ "framerate": "25/1",
+ "duration": "60.0",
+ "height": "720",
+ "width": "1280",
+ "lang": "en",
+ "fileSize": "6789",
+ },
{
"url": (
f"https://mirror.example/feeds/demo/video/"
diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py
index 66246f3..ff43b6a 100644
--- a/tests/test_file_feeds.py
+++ b/tests/test_file_feeds.py
@@ -71,16 +71,22 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
"https://example.com/media/podcast.mp3",
) == (
"audio/"
- f"{local_audio_path('https://example.com/media/podcast.mp3')}"
- "-vbr7-3b2b0f13.mp3"
+ + published_media_path(
+ FileType.AUDIO,
+ "https://example.com/media/podcast.mp3",
+ repub_settings.REPUBLISHER_AUDIO[0],
+ )
)
assert spider.rewrite_file_url(
FileType.VIDEO,
"https://example.com/media/clip.mp4",
) == (
"video/"
- f"{local_video_path('https://example.com/media/clip.mp4')}"
- "-720-457f0928.mp4"
+ + published_media_path(
+ FileType.VIDEO,
+ "https://example.com/media/clip.mp4",
+ repub_settings.REPUBLISHER_VIDEO[0],
+ )
)
@@ -90,10 +96,10 @@ def test_published_media_path_changes_when_profile_args_change() -> None:
base_profile = repub_settings.REPUBLISHER_VIDEO[0]
assert published_media_path(FileType.AUDIO, source_url, audio_profile) == (
- f"{local_audio_path(source_url)}-vbr7-3b2b0f13.mp3"
+ f"{local_audio_path(source_url)}-mp3_vbr7_voice-1cc131cf.mp3"
)
assert published_media_path(FileType.VIDEO, source_url, base_profile) == (
- f"{local_video_path(source_url)}-720-457f0928.mp4"
+ f"{local_video_path(source_url)}-main-4fb03ba0.mp4"
)
changed_audio_profile = {**audio_profile, "max_bitrate": 128000}
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index e82672b..523f9bd 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -8,6 +8,7 @@ from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from repub import media
+from repub import settings as repub_settings
from repub.config import (
FeedConfig,
RepublisherConfig,
@@ -16,7 +17,12 @@ from repub.config import (
)
from repub.items import ElementItem
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
-from repub.utils import local_audio_path, local_video_path
+from repub.utils import (
+ FileType,
+ local_audio_path,
+ local_video_path,
+ published_media_path,
+)
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
@@ -309,6 +315,103 @@ def test_video_transcode_params_scales_to_max_height_for_multipass() -> None:
}
+def test_audio_transcode_params_accepts_m4a_format_family() -> None:
+ params = media.audio_transcode_params(
+ {
+ "format": {
+ "bit_rate": "20000",
+ "format_name": "mov,mp4,m4a,3gp,3g2,mj2",
+ },
+ "streams": [
+ {
+ "codec_type": "audio",
+ "codec_name": "aac",
+ "bit_rate": "20000",
+ "duration_ts": "1",
+ }
+ ],
+ },
+ cast(
+ media.AudioSettings,
+ {
+ "name": "m4a",
+ "format": "m4a",
+ "max_bitrate": 64000,
+ "mimetype": "audio/mp4",
+ "extension": "m4a",
+ "ffmpeg_audio_params": {
+ "acodec": "libfdk_aac",
+ "vbr": "2",
+ },
+ },
+ ),
+ )
+
+ assert params is None
+
+
+def test_audio_meta_handles_webm_without_duration_ts() -> None:
+ assert media.audio_meta(
+ {
+ "format": {
+ "duration": "1.0",
+ "size": "100",
+ "bit_rate": "48000",
+ "format_name": "matroska,webm",
+ },
+ "streams": [
+ {
+ "codec_type": "audio",
+ "codec_name": "opus",
+ "sample_rate": "48000",
+ "channels": 1,
+ }
+ ],
+ }
+ ) == {
+ "duration": "1.0",
+ "fileSize": "100",
+ "bitrate": 48000,
+ "samplingrate": 48000,
+ "channels": 1,
+ }
+
+
+def test_video_meta_handles_webm_without_duration_ts() -> None:
+ assert media.video_meta(
+ {
+ "format": {
+ "duration": "1.0",
+ "size": "200",
+ "bit_rate": "64000",
+ "format_name": "matroska,webm",
+ },
+ "streams": [
+ {
+ "codec_type": "video",
+ "codec_name": "vp9",
+ "width": 640,
+ "height": 360,
+ "avg_frame_rate": "25/1",
+ },
+ {
+ "codec_type": "audio",
+ "codec_name": "opus",
+ "sample_rate": "48000",
+ "channels": 1,
+ },
+ ],
+ }
+ ) == {
+ "duration": "1.0",
+ "fileSize": "200",
+ "width": 640,
+ "height": 360,
+ "bitrate": 64000,
+ "framerate": "25/1",
+ }
+
+
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
monkeypatch, tmp_path: Path
) -> None:
@@ -337,13 +440,24 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
output_path.write_bytes(settings["name"].encode("utf-8"))
return str(output_path)
+ audio_default_path = published_media_path(
+ FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0]
+ )
+ audio_m4a_path = published_media_path(
+ FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1]
+ )
+ audio_webm_path = published_media_path(
+ FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2]
+ )
+
def fake_probe_media(file_path: str):
- if file_path.endswith(".mp3-vbr7-3b2b0f13.mp3"):
+ file_name = Path(file_path).name
+ if file_path.endswith(audio_default_path) or file_name == "mp3_vbr7_voice.mp3":
return {
"format": {
"duration": "61.2",
"size": "4567",
- "bit_rate": "96000",
+ "bit_rate": "37209",
"format_name": "mp3",
"format_long_name": "MP3",
},
@@ -351,48 +465,69 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
{
"codec_type": "audio",
"codec_name": "mp3",
- "bit_rate": "96000",
+ "bit_rate": "37209",
"duration_ts": "61200",
- "sample_rate": "44100",
- "channels": 2,
+ "sample_rate": "48000",
+ "channels": 1,
}
],
}
- if file_path.endswith(".mp3"):
+ if file_path.endswith(audio_m4a_path) or file_name == "m4a_aac_vbr2_voice.m4a":
return {
"format": {
"duration": "61.2",
- "size": "5678",
- "bit_rate": "128000",
- "format_name": "mp3",
- "format_long_name": "MP3",
+ "size": "3456",
+ "bit_rate": "20746",
+ "format_name": "mov,mp4,m4a,3gp,3g2,mj2",
+ "format_long_name": "AAC",
},
"streams": [
{
"codec_type": "audio",
- "codec_name": "mp3",
- "bit_rate": "128000",
+ "codec_name": "aac",
+ "bit_rate": "20746",
"duration_ts": "61200",
- "sample_rate": "44100",
- "channels": 2,
+ "sample_rate": "48000",
+ "channels": 1,
+ }
+ ],
+ }
+ if (
+ file_path.endswith(audio_webm_path)
+ or file_name == "webm_opus_voice_48k.webm"
+ ):
+ return {
+ "format": {
+ "duration": "61.2",
+ "size": "2345",
+ "bit_rate": "48000",
+ "format_name": "matroska,webm",
+ "format_long_name": "WebM",
+ },
+ "streams": [
+ {
+ "codec_type": "audio",
+ "codec_name": "opus",
+ "sample_rate": "48000",
+ "channels": 1,
}
],
}
return {
"format": {
"duration": "61.2",
- "size": "3456",
- "bit_rate": "88000",
- "format_name": "aac",
- "format_long_name": "AAC",
+ "size": "5678",
+ "bit_rate": "128000",
+ "format_name": "mp3",
+ "format_long_name": "MP3",
},
"streams": [
{
"codec_type": "audio",
- "codec_name": "aac",
- "bit_rate": "88000",
+ "codec_name": "mp3",
+ "bit_rate": "128000",
"duration_ts": "61200",
- "sample_rate": "48000",
+ "sample_rate": "44100",
"channels": 2,
}
],
@@ -423,43 +558,48 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert isinstance(result["checksum"], str)
assert result == {
"url": source_url,
- "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3",
+ "path": audio_default_path,
"published_url": (
- "https://mirror.example/feeds/nasa/audio/"
- f"{audio_base_path}-vbr7-3b2b0f13.mp3"
+ f"https://mirror.example/feeds/nasa/audio/{audio_default_path}"
),
"checksum": result["checksum"],
"status": "downloaded",
"variants": [
{
- "url": (
- "https://mirror.example/feeds/nasa/audio/"
- f"{audio_base_path}-vbr7-3b2b0f13.mp3"
- ),
- "path": f"{audio_base_path}-vbr7-3b2b0f13.mp3",
- "type": "audio/mp3",
+ "url": f"https://mirror.example/feeds/nasa/audio/{audio_default_path}",
+ "path": audio_default_path,
+ "type": "audio/mpeg",
"medium": "audio",
"isDefault": "true",
"fileSize": "4567",
- "bitrate": 96000,
+ "bitrate": 37209,
"duration": "61.2",
- "samplingrate": 44100,
- "channels": 2,
+ "samplingrate": 48000,
+ "channels": 1,
},
{
- "url": (
- "https://mirror.example/feeds/nasa/audio/"
- f"{audio_base_path}-vbr3-4a2a58d5.aac"
- ),
- "path": f"{audio_base_path}-vbr3-4a2a58d5.aac",
- "type": "audio/aac",
+ "url": f"https://mirror.example/feeds/nasa/audio/{audio_m4a_path}",
+ "path": audio_m4a_path,
+ "type": "audio/mp4",
"medium": "audio",
"isDefault": "false",
"fileSize": "3456",
- "bitrate": 88000,
+ "bitrate": 20746,
"duration": "61.2",
"samplingrate": 48000,
- "channels": 2,
+ "channels": 1,
+ },
+ {
+ "url": f"https://mirror.example/feeds/nasa/audio/{audio_webm_path}",
+ "path": audio_webm_path,
+ "type": "audio/webm",
+ "medium": "audio",
+ "isDefault": "false",
+ "fileSize": "2345",
+ "bitrate": 48000,
+ "duration": "61.2",
+ "samplingrate": 48000,
+ "channels": 1,
},
{
"url": f"https://mirror.example/feeds/nasa/audio/{audio_base_path}",
@@ -477,8 +617,9 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
}
assert persisted == [
(audio_base_path, "audio/mpeg"),
- (f"{audio_base_path}-vbr7-3b2b0f13.mp3", "audio/mp3"),
- (f"{audio_base_path}-vbr3-4a2a58d5.aac", "audio/aac"),
+ (audio_default_path, "audio/mpeg"),
+ (audio_m4a_path, "audio/mp4"),
+ (audio_webm_path, "audio/webm"),
]
completed_item = pipeline.item_completed(
@@ -518,23 +659,70 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
return str(output_path)
monkeypatch.setattr(pipeline, "transcode", fake_transcode)
- transcoded_suffix = "-720-457f0928.mp4"
- monkeypatch.setattr(
- media,
- "probe_media",
- lambda _: {
+ video_main_path = published_media_path(
+ FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[0]
+ )
+ video_fallback_path = published_media_path(
+ FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[1]
+ )
+
+ def fake_probe_media(file_path: str):
+ file_name = Path(file_path).name
+ if file_path.endswith(video_main_path) or file_name == "main.mp4":
+ return {
+ "format": {
+ "duration": "60.0",
+ "size": "9876",
+ "bit_rate": "123456",
+ "format_name": "mp4",
+ "format_long_name": "MP4",
+ },
+ "streams": [
+ {
+ "codec_type": "video",
+ "codec_name": "h264",
+ "bit_rate": "123456",
+ "duration_ts": "60000",
+ "width": 1280,
+ "height": 720,
+ "avg_frame_rate": "30/1",
+ },
+ {
+ "codec_type": "audio",
+ "codec_name": "aac",
+ "bit_rate": "96000",
+ "duration_ts": "60000",
+ },
+ ],
+ }
+ if file_path.endswith(video_fallback_path) or file_name == "fallback.webm":
+ return {
+ "format": {
+ "duration": "60.0",
+ "size": "6789",
+ "bit_rate": "64000",
+ "format_name": "matroska,webm",
+ "format_long_name": "WebM",
+ },
+ "streams": [
+ {
+ "codec_type": "video",
+ "codec_name": "vp9",
+ "width": 1280,
+ "height": 720,
+ "avg_frame_rate": "25/1",
+ },
+ {
+ "codec_type": "audio",
+ "codec_name": "opus",
+ },
+ ],
+ }
+ return {
"format": {
"duration": "60.0",
- "size": (
- "12345"
- if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
- else "9876"
- ),
- "bit_rate": (
- "456789"
- if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
- else "123456"
- ),
+ "size": "12345",
+ "bit_rate": "456789",
"format_name": "mp4",
"format_long_name": "MP4",
},
@@ -542,27 +730,11 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
{
"codec_type": "video",
"codec_name": "h264",
- "bit_rate": (
- "456789"
- if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
- else "123456"
- ),
+ "bit_rate": "456789",
"duration_ts": "60000",
- "width": (
- 640
- if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
- else 1280
- ),
- "height": (
- 360
- if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
- else 720
- ),
- "avg_frame_rate": (
- "24/1"
- if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
- else "30/1"
- ),
+ "width": 640,
+ "height": 360,
+ "avg_frame_rate": "24/1",
},
{
"codec_type": "audio",
@@ -571,8 +743,9 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
"duration_ts": "60000",
},
],
- },
- )
+ }
+
+ monkeypatch.setattr(media, "probe_media", fake_probe_media)
def fake_persist_file(path, buf, info, meta=None, headers=None):
del info, meta
@@ -596,20 +769,14 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert isinstance(result["checksum"], str)
assert result == {
"url": source_url,
- "path": f"{video_base_path}-720-457f0928.mp4",
- "published_url": (
- "https://mirror.example/feeds/nasa/video/"
- f"{video_base_path}-720-457f0928.mp4"
- ),
+ "path": video_main_path,
+ "published_url": (f"https://mirror.example/feeds/nasa/video/{video_main_path}"),
"checksum": result["checksum"],
"status": "downloaded",
"variants": [
{
- "url": (
- "https://mirror.example/feeds/nasa/video/"
- f"{video_base_path}-720-457f0928.mp4"
- ),
- "path": f"{video_base_path}-720-457f0928.mp4",
+ "url": f"https://mirror.example/feeds/nasa/video/{video_main_path}",
+ "path": video_main_path,
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
@@ -620,6 +787,19 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
"height": 720,
"framerate": "30/1",
},
+ {
+ "url": f"https://mirror.example/feeds/nasa/video/{video_fallback_path}",
+ "path": video_fallback_path,
+ "type": "video/webm",
+ "medium": "video",
+ "isDefault": "false",
+ "fileSize": "6789",
+ "bitrate": 64000,
+ "duration": "60.0",
+ "width": 1280,
+ "height": 720,
+ "framerate": "25/1",
+ },
{
"url": f"https://mirror.example/feeds/nasa/video/{video_base_path}",
"path": video_base_path,
@@ -637,7 +817,8 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
}
assert persisted == [
(video_base_path, "video/mp4"),
- (f"{video_base_path}-720-457f0928.mp4", "video/mp4"),
+ (video_main_path, "video/mp4"),
+ (video_fallback_path, "video/webm"),
]
@@ -650,13 +831,24 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
source_url = "https://example.com/podcast.mp3"
audio_base_path = local_audio_path(source_url)
original_path = store_dir(pipeline) / audio_base_path
- canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7-3b2b0f13.mp3"
- secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3-4a2a58d5.aac"
+ audio_default_path = published_media_path(
+ FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0]
+ )
+ audio_m4a_path = published_media_path(
+ FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1]
+ )
+ audio_webm_path = published_media_path(
+ FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2]
+ )
+ canonical_path = store_dir(pipeline) / audio_default_path
+ m4a_path = store_dir(pipeline) / audio_m4a_path
+ webm_path = store_dir(pipeline) / audio_webm_path
original_path.parent.mkdir(parents=True, exist_ok=True)
original_path.write_bytes(b"original")
canonical_path.parent.mkdir(parents=True, exist_ok=True)
canonical_path.write_bytes(b"default")
- secondary_path.write_bytes(b"alt")
+ m4a_path.write_bytes(b"alt-aac")
+ webm_path.write_bytes(b"alt-webm")
stat_paths: list[str] = []
original_stat_file = pipeline.store.stat_file
item = ElementItem(
@@ -683,12 +875,38 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
lambda file_path: {
"format": {
"duration": "61.2",
- "size": ("4567" if file_path.endswith("vbr7-3b2b0f13.mp3") else "3456"),
+ "size": (
+ "4567"
+ if file_path.endswith(audio_default_path)
+ else (
+ "3456"
+ if file_path.endswith(audio_m4a_path)
+ else "2345" if file_path.endswith(audio_webm_path) else "5678"
+ )
+ ),
"bit_rate": (
- "96000" if file_path.endswith("vbr7-3b2b0f13.mp3") else "88000"
+ "37209"
+ if file_path.endswith(audio_default_path)
+ else (
+ "20746"
+ if file_path.endswith(audio_m4a_path)
+ else (
+ "48000" if file_path.endswith(audio_webm_path) else "128000"
+ )
+ )
),
"format_name": (
- "mp3" if file_path.endswith("vbr7-3b2b0f13.mp3") else "aac"
+ "mp3"
+ if file_path.endswith(audio_default_path)
+ else (
+ "mov,mp4,m4a,3gp,3g2,mj2"
+ if file_path.endswith(audio_m4a_path)
+ else (
+ "matroska,webm"
+ if file_path.endswith(audio_webm_path)
+ else "mp3"
+ )
+ )
),
"format_long_name": "Audio",
},
@@ -696,16 +914,36 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
{
"codec_type": "audio",
"codec_name": (
- "mp3" if file_path.endswith("vbr7-3b2b0f13.mp3") else "aac"
+ "mp3"
+ if file_path.endswith(audio_default_path)
+ else (
+ "aac"
+ if file_path.endswith(audio_m4a_path)
+ else (
+ "opus" if file_path.endswith(audio_webm_path) else "mp3"
+ )
+ )
),
"bit_rate": (
- "96000" if file_path.endswith("vbr7-3b2b0f13.mp3") else "88000"
+ "37209"
+ if file_path.endswith(audio_default_path)
+ else (
+ "20746"
+ if file_path.endswith(audio_m4a_path)
+ else (
+ None
+ if file_path.endswith(audio_webm_path)
+ else "128000"
+ )
+ )
+ ),
+ "duration_ts": (
+ None if file_path.endswith(audio_webm_path) else "61200"
),
- "duration_ts": "61200",
"sample_rate": (
- "44100" if file_path.endswith("vbr7-3b2b0f13.mp3") else "48000"
+ "44100" if file_path == str(original_path) else "48000"
),
- "channels": 2,
+ "channels": 1 if file_path != str(original_path) else 2,
}
],
},
@@ -717,12 +955,13 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
item=item,
)
assert result is not None
- assert result["path"] == f"{audio_base_path}-vbr7-3b2b0f13.mp3"
+ assert result["path"] == audio_default_path
assert result["status"] == "uptodate"
assert [variant.get("path") for variant in result["variants"]] == [
- f"{audio_base_path}-vbr7-3b2b0f13.mp3",
- f"{audio_base_path}-vbr3-4a2a58d5.aac",
+ audio_default_path,
+ audio_m4a_path,
+ audio_webm_path,
audio_base_path,
]
assert f"{audio_base_path}.mp3" not in stat_paths
- assert stat_paths[0] == f"{audio_base_path}-vbr7-3b2b0f13.mp3"
+ assert stat_paths[0] == audio_default_path
From cebf037753d4240d793627f8d3d465c7bfbebe23 Mon Sep 17 00:00:00 2001
From: Abel Luck
Date: Wed, 1 Apr 2026 17:27:20 +0200
Subject: [PATCH 3/4] Prefer content over description for item bodies
---
repub/spiders/rss_spider.py | 12 ++++++-
tests/test_feed_validation.py | 62 +++++++++++++++++++++++++++++++----
2 files changed, 67 insertions(+), 7 deletions(-)
diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py
index 80be20e..fa27317 100644
--- a/repub/spiders/rss_spider.py
+++ b/repub/spiders/rss_spider.py
@@ -281,6 +281,14 @@ class RssFeedSpider(BaseRssFeedSpider):
file_urls = []
audio_urls = []
video_urls = []
+ source_description_html = (
+ sanitize_html(entry.get("summary", "")) if "summary_detail" in entry else ""
+ )
+ has_content_html = any(
+ c.type == "text/html" and ((getattr(c, "value", "") or "").strip() != "")
+ for c in entry.get("content", [])
+ )
+ description_html = source_description_html if has_content_html else ""
def add_url(file_type, url):
if file_type == FileType.IMAGE:
@@ -295,7 +303,7 @@ class RssFeedSpider(BaseRssFeedSpider):
item = E.item(
E.title(entry.get("title")),
E.link(entry.get("link")),
- E.description(sanitize_html(entry.get("description", ""))),
+ E.description(description_html),
E.guid(
entry.get("id"),
{"isPermaLink": "true" if entry.guidislink else "false"},
@@ -341,6 +349,8 @@ class RssFeedSpider(BaseRssFeedSpider):
image_urls.extend(urls[FileType.IMAGE])
video_urls.extend(urls[FileType.VIDEO])
audio_urls.extend(urls[FileType.AUDIO])
+ if not has_content_html and source_description_html.strip() != "":
+ item.append(CONTENT.encoded(CDATA(source_description_html)))
if isinstance(entry.get("media_content"), list):
for media in (
diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py
index 290a90a..9e1f80b 100644
--- a/tests/test_feed_validation.py
+++ b/tests/test_feed_validation.py
@@ -437,10 +437,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
assert "<" not in itunes_summary
assert ">" not in itunes_summary
- assert "contenteditable=" not in xml
- assert "mode=" not in xml
- assert "querystring=" not in xml
- assert (
- f"https://mirror.example/feeds/demo/images/{local_image_path(source_image)}"
- in xml
+
+def test_item_body_uses_description_only_when_content_is_also_present() -> None:
+ xml, root = _serialize_feed(
+ feed_url="https://mirror.example",
+ feed_text="""
+
+
+ Demo Feed
+ https://source.example/feed
+ Demo description
+ -
+ Description Only
+ https://source.example/description-only
+ Description body
]]>
+ entry-description-only
+ Tue, 31 Mar 2026 10:31:50 +0000
+
+ -
+ Content Only
+ https://source.example/content-only
+ entry-content-only
+ Tue, 31 Mar 2026 10:31:50 +0000
+ Content body]]>
+
+ -
+ Both Present
+ https://source.example/both-present
+ Summary body]]>
+ entry-both-present
+ Tue, 31 Mar 2026 10:31:50 +0000
+ Full body]]>
+
+
+
+""",
+ )
+
+ items = root.findall("./channel/item")
+ assert len(items) == 3
+
+ description_only, content_only, both_present = items
+
+ assert description_only.findtext("description") in (None, "")
+ assert description_only.findtext("content:encoded", namespaces=nsmap) == (
+ "Description body
"
+ )
+
+ assert content_only.findtext("description") in (None, "")
+ assert content_only.findtext("content:encoded", namespaces=nsmap) == (
+ "Content body
"
+ )
+
+ assert both_present.findtext("description") == "Summary body
"
+ assert both_present.findtext("content:encoded", namespaces=nsmap) == (
+ "Full body
"
)
From 180677efa71b97f0a9d9cd2d4ea0e4bcdac79a98 Mon Sep 17 00:00:00 2001
From: Abel Luck
Date: Wed, 1 Apr 2026 17:29:27 +0200
Subject: [PATCH 4/4] Update pygea dependency
---
uv.lock | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/uv.lock b/uv.lock
index 50e2792..857e52d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -936,7 +936,7 @@ wheels = [
[[package]]
name = "pygea"
version = "0.1.0"
-source = { git = "https://guardianproject.dev/anynews/pygea.git#bff04afbf612612108d9651f355d066c4b5f6a64" }
+source = { git = "https://guardianproject.dev/anynews/pygea.git#c58bac3abddc019c74a1de45835688461f39f2d0" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "feedgen" },