Include original media in media groups

This commit is contained in:
Abel Luck 2026-03-31 14:33:49 +02:00
parent 89d462e280
commit 954608c5f9
3 changed files with 212 additions and 14 deletions

View file

@ -1,5 +1,6 @@
import hashlib import hashlib
import logging import logging
import mimetypes
import tempfile import tempfile
import time import time
from io import BytesIO from io import BytesIO
@ -116,6 +117,26 @@ class TranscodePipeline(BaseFilesPipeline):
for index, setting in enumerate(settings) for index, setting in enumerate(settings)
] ]
def original_path(self, source_url: str) -> str:
    """Return the store-relative path used for the untranscoded source file.

    The path is produced by the ``repub.utils`` helper that matches this
    pipeline's ``media_type`` (audio or video).

    Args:
        source_url: URL the media was originally downloaded from.

    Returns:
        The path computed by ``local_audio_path`` or ``local_video_path``.

    Raises:
        ValueError: If ``self.media_type`` is neither AUDIO nor VIDEO.
    """
    kind = self.media_type
    if kind == repub.utils.FileType.AUDIO:
        resolve = repub.utils.local_audio_path
    elif kind == repub.utils.FileType.VIDEO:
        resolve = repub.utils.local_video_path
    else:
        raise ValueError(f"Unsupported media type: {self.media_type}")
    return resolve(source_url)
def original_mimetype(self, source_url: str, response=None) -> str:
    """Return the MIME type to report for the untranscoded source file.

    Resolution order:

    1. The ``Content-Type`` header of ``response``, with any parameters
       (``; charset=...``) stripped.
    2. ``mimetypes.guess_type`` applied to ``source_url``.
    3. A per-media-type default: ``audio/mpeg`` or ``video/mp4``.

    A header that is present but unusable (blank once parameters are
    stripped, or not decodable as UTF-8) is ignored so the later steps can
    still produce a meaningful type instead of an empty string or a crash.

    Args:
        source_url: URL the media was originally downloaded from.
        response: Optional download response; only ``response.headers`` is
            read, looked up with the bytes key ``b"Content-Type"``.

    Returns:
        A bare MIME type string such as ``"audio/mpeg"``.

    Raises:
        KeyError: If neither the header nor the URL yields a type and
            ``self.media_type`` is not AUDIO or VIDEO.
    """
    if response is not None:
        raw_header = response.headers.get(b"Content-Type")
        if raw_header:
            try:
                header_text = (
                    raw_header.decode("utf-8")
                    if isinstance(raw_header, bytes)
                    else str(raw_header)
                )
            except UnicodeDecodeError:
                # A malformed header must not abort the download; fall back.
                header_text = ""
            header_type = header_text.split(";", 1)[0].strip()
            # Check the *parsed* value: a header holding only parameters
            # would otherwise be returned as an empty MIME type.
            if header_type:
                return header_type
    guessed = mimetypes.guess_type(source_url)[0]
    if guessed:
        return guessed
    return {
        repub.utils.FileType.AUDIO: "audio/mpeg",
        repub.utils.FileType.VIDEO: "video/mp4",
    }[self.media_type]
def published_url(self, path: str, item=None) -> str: def published_url(self, path: str, item=None) -> str:
relative_path = f"{self.media_dir()}/{path}" relative_path = f"{self.media_dir()}/{path}"
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/") feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
@ -130,7 +151,7 @@ class TranscodePipeline(BaseFilesPipeline):
self, self,
*, *,
path: str, path: str,
setting: media.MediaSettings, mimetype: str,
probe_result: dict[str, Any], probe_result: dict[str, Any],
is_default: bool, is_default: bool,
item=None, item=None,
@ -138,7 +159,7 @@ class TranscodePipeline(BaseFilesPipeline):
variant: MediaVariant = { variant: MediaVariant = {
"url": self.published_url(path, item), "url": self.published_url(path, item),
"path": path, "path": path,
"type": setting["mimetype"], "type": mimetype,
"medium": self.media_type.value, "medium": self.media_type.value,
"isDefault": "true" if is_default else "false", "isDefault": "true" if is_default else "false",
} }
@ -158,12 +179,24 @@ class TranscodePipeline(BaseFilesPipeline):
variants.append( variants.append(
self.media_variant( self.media_variant(
path=path, path=path,
setting=setting, mimetype=setting["mimetype"],
probe_result=probe_result, probe_result=probe_result,
is_default=is_default, is_default=is_default,
item=item, item=item,
) )
) )
original_path = self.original_path(request.url)
original_file = self.local_store_path(original_path)
if original_file.exists():
variants.append(
self.media_variant(
path=original_path,
mimetype=self.original_mimetype(request.url),
probe_result=media.probe_media(str(original_file)),
is_default=False,
item=item,
)
)
return variants return variants
def make_file_result( def make_file_result(
@ -201,6 +234,11 @@ class TranscodePipeline(BaseFilesPipeline):
for _, _, path in self.variant_paths(request.url): for _, _, path in self.variant_paths(request.url):
if not cast(dict[str, Any] | None, self.store.stat_file(path, info)): if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
return None return None
if not cast(
dict[str, Any] | None,
self.store.stat_file(self.original_path(request.url), info),
):
return None
self.inc_stats("uptodate") self.inc_stats("uptodate")
return self.make_file_result( return self.make_file_result(
request, request,
@ -218,6 +256,23 @@ class TranscodePipeline(BaseFilesPipeline):
tmp_file = f"{tmp_dir}/original" tmp_file = f"{tmp_dir}/original"
with open(tmp_file, "wb") as f: with open(tmp_file, "wb") as f:
f.write(response.body) f.write(response.body)
original_path = self.original_path(request.url)
if not cast(
dict[str, Any] | None,
self.store.stat_file(original_path, info),
):
original_buf = read_asset(tmp_file)
self.store.persist_file(
original_path,
original_buf,
info,
meta=self.get_media_meta(media.probe_media(tmp_file)),
headers={
"Content-Type": self.original_mimetype(
request.url, response=response
)
},
)
for _, setting, final_path in self.variant_paths(request.url): for _, setting, final_path in self.variant_paths(request.url):
stat = cast( stat = cast(
dict[str, Any] | None, dict[str, Any] | None,

View file

@ -114,6 +114,21 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
"samplingrate": "48000", "samplingrate": "48000",
"channels": "2", "channels": "2",
}, },
{
"url": _published_url(
"https://mirror.example",
f"audio/{audio_base_path}",
),
"path": audio_base_path,
"type": "audio/mpeg",
"medium": "audio",
"isDefault": "false",
"fileSize": "5678",
"bitrate": "128000",
"duration": "61.2",
"samplingrate": "44100",
"channels": "2",
},
], ],
} }
] ]
@ -143,7 +158,23 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
"width": "1280", "width": "1280",
"height": "720", "height": "720",
"framerate": "30/1", "framerate": "30/1",
} },
{
"url": _published_url(
"https://mirror.example",
f"video/{video_base_path}",
),
"path": video_base_path,
"type": "video/mp4",
"medium": "video",
"isDefault": "false",
"fileSize": "12345",
"bitrate": "456789",
"duration": "60.0",
"width": "640",
"height": "360",
"framerate": "24/1",
},
], ],
} }
] ]
@ -271,6 +302,20 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
"duration": "61.2", "duration": "61.2",
"fileSize": "3456", "fileSize": "3456",
}, },
{
"url": (
f"https://mirror.example/feeds/demo/audio/"
f"{local_audio_path(source_audio)}"
),
"type": "audio/mpeg",
"medium": "audio",
"isDefault": "false",
"bitrate": "128000",
"samplingrate": "44100",
"channels": "2",
"duration": "61.2",
"fileSize": "5678",
},
] ]
video_variants = video_group.findall("media:content", namespaces=nsmap) video_variants = video_group.findall("media:content", namespaces=nsmap)
@ -291,7 +336,24 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
"width": "1280", "width": "1280",
"lang": "en", "lang": "en",
"fileSize": "9876", "fileSize": "9876",
} },
{
"url": (
f"https://mirror.example/feeds/demo/video/"
f"{local_video_path(source_video)}"
),
"type": "video/mp4",
"medium": "video",
"isDefault": "false",
"expression": "full",
"bitrate": "456789",
"framerate": "24/1",
"duration": "60.0",
"height": "360",
"width": "640",
"lang": "en",
"fileSize": "12345",
},
] ]
itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap) itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)

View file

@ -233,7 +233,7 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
return str(output_path) return str(output_path)
def fake_probe_media(file_path: str): def fake_probe_media(file_path: str):
if file_path.endswith("vbr7.mp3"): if file_path.endswith(".mp3-vbr7.mp3"):
return { return {
"format": { "format": {
"duration": "61.2", "duration": "61.2",
@ -253,6 +253,26 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
} }
], ],
} }
if file_path.endswith(".mp3"):
return {
"format": {
"duration": "61.2",
"size": "5678",
"bit_rate": "128000",
"format_name": "mp3",
"format_long_name": "MP3",
},
"streams": [
{
"codec_type": "audio",
"codec_name": "mp3",
"bit_rate": "128000",
"duration_ts": "61200",
"sample_rate": "44100",
"channels": 2,
}
],
}
return { return {
"format": { "format": {
"duration": "61.2", "duration": "61.2",
@ -333,9 +353,22 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
"samplingrate": 48000, "samplingrate": 48000,
"channels": 2, "channels": 2,
}, },
{
"url": f"https://mirror.example/feeds/nasa/audio/{audio_base_path}",
"path": audio_base_path,
"type": "audio/mpeg",
"medium": "audio",
"isDefault": "false",
"fileSize": "5678",
"bitrate": 128000,
"duration": "61.2",
"samplingrate": 44100,
"channels": 2,
},
], ],
} }
assert persisted == [ assert persisted == [
(audio_base_path, "audio/mpeg"),
(f"{audio_base_path}-vbr7.mp3", "audio/mp3"), (f"{audio_base_path}-vbr7.mp3", "audio/mp3"),
(f"{audio_base_path}-vbr3.aac", "audio/aac"), (f"{audio_base_path}-vbr3.aac", "audio/aac"),
] ]
@ -383,8 +416,16 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
lambda _: { lambda _: {
"format": { "format": {
"duration": "60.0", "duration": "60.0",
"size": "9876", "size": (
"bit_rate": "123456", "12345"
if _.endswith(".mp4") and not _.endswith("-720.mp4")
else "9876"
),
"bit_rate": (
"456789"
if _.endswith(".mp4") and not _.endswith("-720.mp4")
else "123456"
),
"format_name": "mp4", "format_name": "mp4",
"format_long_name": "MP4", "format_long_name": "MP4",
}, },
@ -392,11 +433,27 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
{ {
"codec_type": "video", "codec_type": "video",
"codec_name": "h264", "codec_name": "h264",
"bit_rate": "123456", "bit_rate": (
"456789"
if _.endswith(".mp4") and not _.endswith("-720.mp4")
else "123456"
),
"duration_ts": "60000", "duration_ts": "60000",
"width": 1280, "width": (
"height": 720, 640
"avg_frame_rate": "30/1", if _.endswith(".mp4") and not _.endswith("-720.mp4")
else 1280
),
"height": (
360
if _.endswith(".mp4") and not _.endswith("-720.mp4")
else 720
),
"avg_frame_rate": (
"24/1"
if _.endswith(".mp4") and not _.endswith("-720.mp4")
else "30/1"
),
}, },
{ {
"codec_type": "audio", "codec_type": "audio",
@ -451,10 +508,26 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
"width": 1280, "width": 1280,
"height": 720, "height": 720,
"framerate": "30/1", "framerate": "30/1",
} },
{
"url": f"https://mirror.example/feeds/nasa/video/{video_base_path}",
"path": video_base_path,
"type": "video/mp4",
"medium": "video",
"isDefault": "false",
"fileSize": "12345",
"bitrate": 456789,
"duration": "60.0",
"width": 640,
"height": 360,
"framerate": "24/1",
},
], ],
} }
assert persisted == [(f"{video_base_path}-720.mp4", "video/mp4")] assert persisted == [
(video_base_path, "video/mp4"),
(f"{video_base_path}-720.mp4", "video/mp4"),
]
def test_audio_pipeline_media_to_download_checks_canonical_path( def test_audio_pipeline_media_to_download_checks_canonical_path(
@ -465,8 +538,11 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
source_url = "https://example.com/podcast.mp3" source_url = "https://example.com/podcast.mp3"
audio_base_path = local_audio_path(source_url) audio_base_path = local_audio_path(source_url)
original_path = store_dir(pipeline) / audio_base_path
canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7.mp3" canonical_path = store_dir(pipeline) / f"{audio_base_path}-vbr7.mp3"
secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3.aac" secondary_path = store_dir(pipeline) / f"{audio_base_path}-vbr3.aac"
original_path.parent.mkdir(parents=True, exist_ok=True)
original_path.write_bytes(b"original")
canonical_path.parent.mkdir(parents=True, exist_ok=True) canonical_path.parent.mkdir(parents=True, exist_ok=True)
canonical_path.write_bytes(b"default") canonical_path.write_bytes(b"default")
secondary_path.write_bytes(b"alt") secondary_path.write_bytes(b"alt")
@ -524,5 +600,10 @@ def test_audio_pipeline_media_to_download_checks_canonical_path(
assert result is not None assert result is not None
assert result["path"] == f"{audio_base_path}-vbr7.mp3" assert result["path"] == f"{audio_base_path}-vbr7.mp3"
assert result["status"] == "uptodate" assert result["status"] == "uptodate"
assert [variant.get("path") for variant in result["variants"]] == [
f"{audio_base_path}-vbr7.mp3",
f"{audio_base_path}-vbr3.aac",
audio_base_path,
]
assert f"{audio_base_path}.mp3" not in stat_paths assert f"{audio_base_path}.mp3" not in stat_paths
assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3" assert stat_paths[0] == f"{audio_base_path}-vbr7.mp3"