Fix published paths for transcoded media

This commit is contained in:
Abel Luck 2026-03-31 14:14:46 +02:00
parent 3f33994cdc
commit 89d462e280
9 changed files with 956 additions and 114 deletions

View file

@@ -3,22 +3,34 @@ from __future__ import annotations
import re
from email.utils import parsedate_to_datetime
from io import BytesIO
from typing import Callable
import lxml.etree as etree
from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub import settings as repub_settings
from repub.exporters import RssExporter
from repub.items import ElementItem
from repub.rss import nsmap
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import local_audio_path, local_file_path, local_image_path
from repub.utils import local_audio_path, local_image_path, local_video_path
RSS_DATE_PATTERN = re.compile(
r"^[A-Z][a-z]{2}, \d{2} [A-Z][a-z]{2} \d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}$"
)
def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Element]:
def _published_url(feed_url: str, path: str) -> str:
return f"{feed_url}/feeds/demo/{path}"
def _serialize_feed(
*,
feed_text: str,
feed_url: str,
prepare_item: Callable[[ElementItem], None] | None = None,
) -> tuple[str, etree._Element]:
spider = RssFeedSpider(feed_name="demo", url="https://source.example/feed.rss")
spider.settings = Settings(
values={
@@ -26,6 +38,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
"REPUBLISHER_FEED_URL": feed_url,
}
)
@@ -39,6 +53,8 @@ def _serialize_feed(*, feed_text: str, feed_url: str) -> tuple[str, etree._Eleme
exporter = RssExporter(output)
exporter.start_exporting()
for item in list(spider._parse(response) or []):
if prepare_item is not None and isinstance(item, ElementItem):
prepare_item(item)
exporter.export_item(item)
exporter.finish_exporting()
@@ -53,8 +69,88 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
source_video = "https://source.example/media/video.mp4"
channel_image = "https://source.example/media/channel.png"
item_image = "https://source.example/media/cover.jpg"
def prepare_item(item: ElementItem) -> None:
audio_base_path = local_audio_path(source_audio)
video_base_path = local_video_path(source_video)
item.audios = [
{
"url": source_audio,
"path": f"{audio_base_path}-vbr7.mp3",
"published_url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}-vbr7.mp3",
),
"checksum": "audio-default",
"status": "downloaded",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}-vbr7.mp3",
),
"path": f"{audio_base_path}-vbr7.mp3",
"type": "audio/mp3",
"medium": "audio",
"isDefault": "true",
"fileSize": "4567",
"bitrate": "96000",
"duration": "61.2",
"samplingrate": "44100",
"channels": "2",
},
{
"url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}-vbr3.aac",
),
"path": f"{audio_base_path}-vbr3.aac",
"type": "audio/aac",
"medium": "audio",
"isDefault": "false",
"fileSize": "3456",
"bitrate": "88000",
"duration": "61.2",
"samplingrate": "48000",
"channels": "2",
},
],
}
]
item.videos = [
{
"url": source_video,
"path": f"{video_base_path}-720.mp4",
"published_url": _published_url(
"https://mirror.example",
f"video/{video_base_path}-720.mp4",
),
"checksum": "video-default",
"status": "downloaded",
"variants": [
{
"url": _published_url(
"https://mirror.example",
f"video/{video_base_path}-720.mp4",
),
"path": f"{video_base_path}-720.mp4",
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
"fileSize": "9876",
"bitrate": "123456",
"duration": "60.0",
"width": "1280",
"height": "720",
"framerate": "30/1",
}
],
}
]
xml, root = _serialize_feed(
feed_url="https://mirror.example",
prepare_item=prepare_item,
feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
@@ -130,25 +226,73 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
enclosure = root.find("./channel/item/enclosure")
assert enclosure is not None
assert enclosure.attrib == {
"url": f"https://mirror.example/feeds/demo/audio/{local_audio_path(source_audio)}",
"length": "123",
"type": "audio/mpeg",
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}-vbr7.mp3"
),
"length": "4567",
"type": "audio/mp3",
}
assert len(enclosure) == 0
media_content = root.find("./channel/item/media:content", namespaces=nsmap)
assert media_content is not None
assert media_content.attrib == {
"url": f"https://mirror.example/feeds/demo/video/{local_file_path(source_video)}",
"type": "video/mp4",
"medium": "video",
"expression": "full",
"duration": "60",
"width": "640",
"height": "360",
"lang": "en",
}
assert len(media_content) == 0
assert root.find("./channel/item/media:content", namespaces=nsmap) is None
media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
assert len(media_groups) == 2
audio_group, video_group = media_groups
audio_variants = audio_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in audio_variants] == [
{
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}-vbr7.mp3"
),
"type": "audio/mp3",
"medium": "audio",
"isDefault": "true",
"bitrate": "96000",
"samplingrate": "44100",
"channels": "2",
"duration": "61.2",
"fileSize": "4567",
},
{
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}-vbr3.aac"
),
"type": "audio/aac",
"medium": "audio",
"isDefault": "false",
"bitrate": "88000",
"samplingrate": "48000",
"channels": "2",
"duration": "61.2",
"fileSize": "3456",
},
]
video_variants = video_group.findall("media:content", namespaces=nsmap)
assert [variant.attrib for variant in video_variants] == [
{
"url": (
f"https://mirror.example/feeds/demo/video/"
f"{local_video_path(source_video)}-720.mp4"
),
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
"expression": "full",
"bitrate": "123456",
"framerate": "30/1",
"duration": "60.0",
"height": "720",
"width": "1280",
"lang": "en",
"fileSize": "9876",
}
]
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
assert itunes_image is not None

View file

@@ -4,8 +4,9 @@ from scrapy.http import TextResponse
from scrapy.settings import Settings
from repub import entrypoint as entrypoint_module
from repub import settings as repub_settings
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import FileType, local_audio_path, local_image_path
from repub.utils import FileType, local_audio_path, local_image_path, local_video_path
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
@@ -50,6 +51,8 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}
)
@@ -62,7 +65,14 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
FileType.AUDIO,
"https://example.com/media/podcast.mp3",
)
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}"
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}-vbr7.mp3"
)
assert (
spider.rewrite_file_url(
FileType.VIDEO,
"https://example.com/media/clip.mp4",
)
== f"video/{local_video_path('https://example.com/media/clip.mp4')}-720.mp4"
)
@@ -91,6 +101,8 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
"REPUBLISHER_FILE_DIR": "files",
"REPUBLISHER_AUDIO_DIR": "audio",
"REPUBLISHER_VIDEO_DIR": "video",
"REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
"REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
}
)
response = TextResponse(

View file

@@ -1,8 +1,11 @@
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
import pytest
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from repub import media
from repub.config import (
@@ -11,7 +14,9 @@ from repub.config import (
build_base_settings,
build_feed_settings,
)
from repub.items import ElementItem
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
from repub.utils import local_audio_path, local_video_path
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
@@ -30,9 +35,18 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
)
base_settings = build_base_settings(config)
settings = build_feed_settings(base_settings, out_dir=out_dir, feed_slug="nasa")
settings.set("REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline")
return SimpleNamespace(settings=settings, request_fingerprinter=object())
def spider_info() -> Any:
    """Build a minimal stand-in for Scrapy's media-pipeline SpiderInfo object."""
    dummy_spider = SimpleNamespace()
    return SimpleNamespace(spider=dummy_spider)
def store_dir(pipeline: Any) -> Path:
    """Return the base directory of the pipeline's file store as a Path."""
    store = cast(Any, pipeline.store)
    return Path(store.basedir)
@pytest.mark.parametrize(
("pipeline_cls", "store_setting"),
[
@@ -46,10 +60,10 @@ def test_pipeline_from_crawler_uses_configured_store(
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = pipeline_cls.from_crawler(crawler)
pipeline = pipeline_cls.from_crawler(cast(Crawler, crawler))
assert pipeline.settings is crawler.settings
assert pipeline.store.basedir == crawler.settings[store_setting]
assert store_dir(pipeline) == Path(crawler.settings[store_setting])
def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
@@ -188,3 +202,327 @@ def test_transcode_video_prints_ffmpeg_output_on_error(
assert ("video-stderr", True) in printed
assert ("video-stdout", False) in printed
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """AudioPipeline.media_downloaded should persist every transcoded rendition
    and return the canonical (default) variant's file info plus the full
    ``variants`` list; item_completed then attaches that result to the item.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)

    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/podcast.mp3"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Stand-in for ffmpeg: emit a tiny placeholder file for the rendition.
        output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        output_path.write_bytes(settings["name"].encode("utf-8"))
        return str(output_path)

    def fake_probe_media(file_path: str):
        # Two canned ffprobe results: the default mp3 rendition vs. the aac one.
        if file_path.endswith("vbr7.mp3"):
            codec, size, bit_rate, sample_rate, long_name = (
                "mp3",
                "4567",
                "96000",
                "44100",
                "MP3",
            )
        else:
            codec, size, bit_rate, sample_rate, long_name = (
                "aac",
                "3456",
                "88000",
                "48000",
                "AAC",
            )
        return {
            "format": {
                "duration": "61.2",
                "size": size,
                "bit_rate": bit_rate,
                "format_name": codec,
                "format_long_name": long_name,
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": codec,
                    "bit_rate": bit_rate,
                    "duration_ts": "61200",
                    "sample_rate": sample_rate,
                    "channels": 2,
                }
            ],
        }

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        # Capture what the pipeline persists (path + Content-Type) and write
        # the bytes under the configured store directory.
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"source-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )

    audio_base_path = local_audio_path(source_url)
    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    assert result == {
        "url": source_url,
        "path": f"{audio_base_path}-vbr7.mp3",
        "published_url": (
            f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3"
        ),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            {
                "url": (
                    f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr7.mp3"
                ),
                "path": f"{audio_base_path}-vbr7.mp3",
                "type": "audio/mp3",
                "medium": "audio",
                "isDefault": "true",
                "fileSize": "4567",
                "bitrate": 96000,
                "duration": "61.2",
                "samplingrate": 44100,
                "channels": 2,
            },
            {
                "url": (
                    f"https://mirror.example/feeds/nasa/audio/{audio_base_path}-vbr3.aac"
                ),
                "path": f"{audio_base_path}-vbr3.aac",
                "type": "audio/aac",
                "medium": "audio",
                "isDefault": "false",
                "fileSize": "3456",
                "bitrate": 88000,
                "duration": "61.2",
                "samplingrate": 48000,
                "channels": 2,
            },
        ],
    }
    # Both renditions must have been written, default first.
    assert persisted == [
        (f"{audio_base_path}-vbr7.mp3", "audio/mp3"),
        (f"{audio_base_path}-vbr3.aac", "audio/aac"),
    ]

    completed_item = pipeline.item_completed(
        [(True, result)],
        item,
        spider_info(),
    )
    assert completed_item.audios == [result]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """VideoPipeline.media_downloaded should persist the transcoded rendition
    and return the canonical variant's file info with its ``variants`` list.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = VideoPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)

    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/video.mp4"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[source_url],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Stand-in for ffmpeg: emit a tiny placeholder file for the rendition.
        output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        output_path.write_bytes(settings["name"].encode("utf-8"))
        return str(output_path)

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)

    def fake_probe_media(_path):
        # Canned ffprobe result: one h264 video stream plus an audio stream.
        return {
            "format": {
                "duration": "60.0",
                "size": "9876",
                "bit_rate": "123456",
                "format_name": "mp4",
                "format_long_name": "MP4",
            },
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "h264",
                    "bit_rate": "123456",
                    "duration_ts": "60000",
                    "width": 1280,
                    "height": 720,
                    "avg_frame_rate": "30/1",
                },
                {
                    "codec_type": "audio",
                    "codec_name": "mp3",
                    "bit_rate": "96000",
                    "duration_ts": "60000",
                },
            ],
        }

    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        # Capture what the pipeline persists (path + Content-Type) and write
        # the bytes under the configured store directory.
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"video-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )

    video_base_path = local_video_path(source_url)
    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    assert result == {
        "url": source_url,
        "path": f"{video_base_path}-720.mp4",
        "published_url": (
            f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
        ),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            {
                "url": (
                    f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
                ),
                "path": f"{video_base_path}-720.mp4",
                "type": "video/mp4",
                "medium": "video",
                "isDefault": "true",
                "fileSize": "9876",
                "bitrate": 123456,
                "duration": "60.0",
                "width": 1280,
                "height": 720,
                "framerate": "30/1",
            }
        ],
    }
    assert persisted == [(f"{video_base_path}-720.mp4", "video/mp4")]
def test_audio_pipeline_media_to_download_checks_canonical_path(
    monkeypatch, tmp_path: Path
) -> None:
    """media_to_download must stat the canonical ``-vbr7.mp3`` path first —
    never the legacy ``.mp3`` path — and report an existing file as uptodate.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)

    source_url = "https://example.com/podcast.mp3"
    audio_base_path = local_audio_path(source_url)
    # Pre-populate the store with both renditions so the download is skipped.
    canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7.mp3"
    secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3.aac"
    canonical_path.parent.mkdir(parents=True, exist_ok=True)
    canonical_path.write_bytes(b"default")
    secondary_path.write_bytes(b"alt")

    stat_paths: list[str] = []
    original_stat_file = pipeline.store.stat_file
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def wrapped_stat_file(path, info):
        # Record every path the pipeline stats so ordering can be asserted.
        stat_paths.append(path)
        return original_stat_file(path, info)

    monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file)

    def fake_probe_media(file_path):
        # Canned ffprobe result, keyed on whether this is the default rendition.
        is_default = file_path.endswith("vbr7.mp3")
        return {
            "format": {
                "duration": "61.2",
                "size": "4567" if is_default else "3456",
                "bit_rate": "96000" if is_default else "88000",
                "format_name": "mp3" if is_default else "aac",
                "format_long_name": "Audio",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": "mp3" if is_default else "aac",
                    "bit_rate": "96000" if is_default else "88000",
                    "duration_ts": "61200",
                    "sample_rate": "44100" if is_default else "48000",
                    "channels": 2,
                }
            ],
        }

    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    result = pipeline.media_to_download(
        Request(source_url),
        spider_info(),
        item=item,
    )
    assert result is not None
    assert result["path"] == f"{audio_base_path}-vbr7.mp3"
    assert result["status"] == "uptodate"
    # The legacy un-suffixed path must never be consulted.
    assert f"{audio_base_path}.mp3" not in stat_paths
    assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3"