Hash video profile paths

This commit is contained in:
Abel Luck 2026-03-31 15:01:49 +02:00
parent 954608c5f9
commit 2ad0536bb0
5 changed files with 180 additions and 28 deletions

View file

@ -170,6 +170,10 @@ def _frame_rate(stream: Dict[str, Any]) -> Optional[str]:
return None
def _scale_to_max_height(max_height: int) -> str:
return f"scale=-2:{max_height}"
def audio_meta(probe: Dict[str, Any]) -> Optional[AudioMeta]:
stream = primary_audio_stream(probe)
if not stream:
@ -303,7 +307,7 @@ def video_transcode_params(
params = {"extension": settings["extension"]}
if len(passes) == 0 or is_vcodec:
if not is_good_height:
params["vf"] = f"scale={width}:{height}"
params["vf"] = _scale_to_max_height(max_height)
if not is_vcodec:
params.update(settings["ffmpeg_video_params"])
if not is_acodec or not is_audio_bitrate:
@ -313,7 +317,7 @@ def video_transcode_params(
for p in passes:
p = copy.deepcopy(p)
if not is_good_height:
p["vf"] = f"scale={width}:{height}"
p["vf"] = _scale_to_max_height(max_height)
params["passes"].append(p)
return params

View file

@ -1,4 +1,5 @@
import hashlib
import json
import mimetypes
from enum import Enum
from pathlib import Path
@ -42,8 +43,23 @@ def local_audio_path(s: str) -> str:
return local_file_path(s)
def variant_media_path(base_path: str, profile: Mapping[str, Any]) -> str:
return f"{base_path}-{profile['name']}.{profile['extension']}"
def profile_settings_hash(profile: Mapping[str, Any]) -> str:
settings = {
key: value
for key, value in profile.items()
if key not in {"name", "mimetype", "extension"}
}
payload = json.dumps(settings, sort_keys=True, separators=(",", ":"))
return hashlib.sha1(to_bytes(payload)).hexdigest()[:8] # nosec
def variant_media_path(
base_path: str, profile: Mapping[str, Any], *, hashed: bool = False
) -> str:
profile_name = str(profile["name"])
if hashed:
profile_name = f"{profile_name}-{profile_settings_hash(profile)}"
return f"{base_path}-{profile_name}.{profile['extension']}"
def published_media_path(
@ -52,7 +68,7 @@ def published_media_path(
if file_type == FileType.AUDIO:
return variant_media_path(local_audio_path(source_url), profile)
if file_type == FileType.VIDEO:
return variant_media_path(local_video_path(source_url), profile)
return variant_media_path(local_video_path(source_url), profile, hashed=True)
raise ValueError(f"Unsupported file type for published media path: {file_type}")

View file

@ -135,10 +135,10 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
item.videos = [
{
"url": source_video,
"path": f"{video_base_path}-720.mp4",
"path": f"{video_base_path}-720-457f0928.mp4",
"published_url": _published_url(
"https://mirror.example",
f"video/{video_base_path}-720.mp4",
f"video/{video_base_path}-720-457f0928.mp4",
),
"checksum": "video-default",
"status": "downloaded",
@ -146,9 +146,9 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
{
"url": _published_url(
"https://mirror.example",
f"video/{video_base_path}-720.mp4",
f"video/{video_base_path}-720-457f0928.mp4",
),
"path": f"{video_base_path}-720.mp4",
"path": f"{video_base_path}-720-457f0928.mp4",
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
@ -323,7 +323,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
{
"url": (
f"https://mirror.example/feeds/demo/video/"
f"{local_video_path(source_video)}-720.mp4"
f"{local_video_path(source_video)}-720-457f0928.mp4"
),
"type": "video/mp4",
"medium": "video",

View file

@ -6,7 +6,13 @@ from scrapy.settings import Settings
from repub import entrypoint as entrypoint_module
from repub import settings as repub_settings
from repub.spiders.rss_spider import RssFeedSpider
from repub.utils import FileType, local_audio_path, local_image_path, local_video_path
from repub.utils import (
FileType,
local_audio_path,
local_image_path,
local_video_path,
published_media_path,
)
def test_entrypoint_supports_file_feed_urls(tmp_path: Path, monkeypatch) -> None:
@ -67,14 +73,32 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
)
== f"audio/{local_audio_path('https://example.com/media/podcast.mp3')}-vbr7.mp3"
)
assert (
spider.rewrite_file_url(
assert spider.rewrite_file_url(
FileType.VIDEO,
"https://example.com/media/clip.mp4",
) == (
"video/"
f"{local_video_path('https://example.com/media/clip.mp4')}"
"-720-457f0928.mp4"
)
== f"video/{local_video_path('https://example.com/media/clip.mp4')}-720.mp4"
def test_published_video_path_changes_when_profile_args_change() -> None:
source_url = "https://example.com/media/clip.mp4"
base_profile = repub_settings.REPUBLISHER_VIDEO[0]
assert published_media_path(
FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0]
) == (f"{local_audio_path(source_url)}-vbr7.mp3")
assert published_media_path(FileType.VIDEO, source_url, base_profile) == (
f"{local_video_path(source_url)}-720-457f0928.mp4"
)
changed_profile = {**base_profile, "max_height": 1080}
assert published_media_path(
FileType.VIDEO, source_url, changed_profile
) != published_media_path(FileType.VIDEO, source_url, base_profile)
def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
feed_text = """<?xml version="1.0" encoding="UTF-8"?>

View file

@ -204,6 +204,111 @@ def test_transcode_video_prints_ffmpeg_output_on_error(
assert ("video-stdout", False) in printed
def test_video_transcode_params_scales_to_max_height() -> None:
params = media.video_transcode_params(
{
"format": {"format_name": "mp4"},
"streams": [
{
"codec_type": "video",
"codec_name": "mpeg4",
"bit_rate": "2000000",
"duration_ts": "1",
"width": 1920,
"height": 1080,
},
{
"codec_type": "audio",
"codec_name": "aac",
"bit_rate": "128000",
"duration_ts": "1",
},
],
},
{
"name": "720",
"container": "mp4",
"vcodec": "h264",
"acodec": "mp3",
"audio_max_bitrate": 96000,
"ffmpeg_audio_params": {"acodec": "libmp3lame"},
"ffmpeg_video_params": {"vcodec": "h264", "strict": "-2"},
"max_height": 720,
"mimetype": "video/mp4",
"extension": "mp4",
},
)
assert params == {
"extension": "mp4",
"vf": "scale=-2:720",
"vcodec": "h264",
"strict": "-2",
"acodec": "libmp3lame",
}
def test_video_transcode_params_scales_to_max_height_for_multipass() -> None:
params = media.video_transcode_params(
{
"format": {"format_name": "mp4"},
"streams": [
{
"codec_type": "video",
"codec_name": "mpeg4",
"bit_rate": "2000000",
"duration_ts": "1",
"width": 1920,
"height": 1080,
},
{
"codec_type": "audio",
"codec_name": "mp3",
"bit_rate": "128000",
"duration_ts": "1",
},
],
},
cast(
media.VideoSettings,
{
"name": "720",
"container": "webm",
"vcodec": "libvpx-vp9",
"acodec": "opus",
"audio_max_bitrate": 96000,
"ffmpeg_audio_params": {"c:a": "libopus", "b:a": "96k"},
"ffmpeg_video_params": {},
"max_height": 720,
"mimetype": "video/webm",
"extension": "webm",
"passes": [
{"c:v": "libvpx-vp9", "pass": "1", "f": "null"},
{"c:v": "libvpx-vp9", "pass": "2", "c:a": "libopus"},
],
},
),
)
assert params == {
"extension": "webm",
"passes": [
{
"c:v": "libvpx-vp9",
"pass": "1",
"f": "null",
"vf": "scale=-2:720",
},
{
"c:v": "libvpx-vp9",
"pass": "2",
"c:a": "libopus",
"vf": "scale=-2:720",
},
],
}
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
monkeypatch, tmp_path: Path
) -> None:
@ -410,6 +515,7 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
return str(output_path)
monkeypatch.setattr(pipeline, "transcode", fake_transcode)
transcoded_suffix = "-720-457f0928.mp4"
monkeypatch.setattr(
media,
"probe_media",
@ -418,12 +524,12 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
"duration": "60.0",
"size": (
"12345"
if _.endswith(".mp4") and not _.endswith("-720.mp4")
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else "9876"
),
"bit_rate": (
"456789"
if _.endswith(".mp4") and not _.endswith("-720.mp4")
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else "123456"
),
"format_name": "mp4",
@ -435,23 +541,23 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
"codec_name": "h264",
"bit_rate": (
"456789"
if _.endswith(".mp4") and not _.endswith("-720.mp4")
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else "123456"
),
"duration_ts": "60000",
"width": (
640
if _.endswith(".mp4") and not _.endswith("-720.mp4")
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else 1280
),
"height": (
360
if _.endswith(".mp4") and not _.endswith("-720.mp4")
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else 720
),
"avg_frame_rate": (
"24/1"
if _.endswith(".mp4") and not _.endswith("-720.mp4")
if _.endswith(".mp4") and not _.endswith(transcoded_suffix)
else "30/1"
),
},
@ -487,18 +593,20 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert isinstance(result["checksum"], str)
assert result == {
"url": source_url,
"path": f"{video_base_path}-720.mp4",
"path": f"{video_base_path}-720-457f0928.mp4",
"published_url": (
f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
"https://mirror.example/feeds/nasa/video/"
f"{video_base_path}-720-457f0928.mp4"
),
"checksum": result["checksum"],
"status": "downloaded",
"variants": [
{
"url": (
f"https://mirror.example/feeds/nasa/video/{video_base_path}-720.mp4"
"https://mirror.example/feeds/nasa/video/"
f"{video_base_path}-720-457f0928.mp4"
),
"path": f"{video_base_path}-720.mp4",
"path": f"{video_base_path}-720-457f0928.mp4",
"type": "video/mp4",
"medium": "video",
"isDefault": "true",
@ -526,7 +634,7 @@ def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variant
}
assert persisted == [
(video_base_path, "video/mp4"),
(f"{video_base_path}-720.mp4", "video/mp4"),
(f"{video_base_path}-720-457f0928.mp4", "video/mp4"),
]