republisher/tests/test_pipelines.py
Abel Luck 525393272e Replace image pipeline with profile-driven variants
- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
2026-05-27 09:24:22 +02:00

1209 lines
38 KiB
Python

import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
import pytest
import pyvips
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from repub import media
from repub import settings as repub_settings
from repub.config import (
FeedConfig,
RepublisherConfig,
build_base_settings,
build_feed_settings,
)
from repub.items import ElementItem
from repub.pipelines import (
AudioPipeline,
FilePipeline,
ImageNormalizePipeline,
ImageThumbnailPipeline,
VideoPipeline,
image_mimetype,
)
from repub.utils import (
FileType,
canonical_published_image_path,
local_audio_path,
local_video_path,
published_image_path,
published_media_path,
source_image_path,
thumbnail_image_path,
)
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
    """Build a minimal crawler stand-in configured for the ``nasa`` feed.

    The returned namespace exposes only ``settings`` and
    ``request_fingerprinter``; tests hand it to ``from_crawler`` via ``cast``.
    The output directory is ``<tmp_path>/mirror`` and the feed URL setting is
    pinned to ``https://mirror.example``.
    """
    mirror_dir = (tmp_path / "mirror").resolve()
    nasa_feed = FeedConfig(
        name="NASA Breaking News",
        slug="nasa",
        url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
    )
    republisher_config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=mirror_dir,
        feeds=(nasa_feed,),
        scrapy_settings={},
    )
    feed_settings = build_feed_settings(
        build_base_settings(republisher_config),
        out_dir=mirror_dir,
        feed_slug="nasa",
    )
    feed_settings.set(
        "REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline"
    )
    return SimpleNamespace(settings=feed_settings, request_fingerprinter=object())
class HashableSpiderInfo:
    """Stand-in for the spider-info object handed to pipeline hooks.

    It carries an empty ``spider`` namespace and uses ``object``'s default
    hash, so instances can be used as set members or dict keys.
    """

    def __init__(self) -> None:
        self.spider = SimpleNamespace()

    # Pin hashing to the default ``object`` implementation explicitly.
    __hash__ = object.__hash__
def spider_info() -> Any:
    """Return a fresh ``HashableSpiderInfo`` for one pipeline call."""
    info = HashableSpiderInfo()
    return info
def store_dir(pipeline: Any) -> Path:
    """Return ``pipeline.store.basedir`` as a ``Path``."""
    store = cast(Any, pipeline.store)
    return Path(store.basedir)
def transparent_png_bytes() -> bytes:
    """Return PNG bytes for an all-zero, four-band image 2 wide and 3 high."""
    blank = cast(Any, pyvips.Image.black(2, 3, bands=4))
    return blank.pngsave_buffer()
def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes:
    """Return PNG bytes for an all-zero image of the given size and band count."""
    blank = cast(Any, pyvips.Image.black(width, height, bands=bands))
    return blank.pngsave_buffer()
@pytest.mark.parametrize(
    ("pipeline_cls", "store_setting"),
    [
        (ImageNormalizePipeline, "IMAGES_STORE"),
        (AudioPipeline, "AUDIO_STORE"),
        (VideoPipeline, "VIDEO_STORE"),
        (FilePipeline, "FILES_STORE"),
    ],
)
def test_pipeline_from_crawler_uses_configured_store(
    tmp_path: Path, pipeline_cls, store_setting: str
) -> None:
    """Each pipeline built via ``from_crawler`` keeps the crawler settings and
    stores files under the directory named by its ``*_STORE`` setting."""
    fake_crawler = build_test_crawler(tmp_path)
    built = pipeline_cls.from_crawler(cast(Crawler, fake_crawler))

    assert built.settings is fake_crawler.settings
    configured_store = Path(fake_crawler.settings[store_setting])
    assert store_dir(built) == configured_store
def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
    """``transcode_audio`` runs ffmpeg once with stdout and stderr captured."""
    source_file = tmp_path / "input.mp3"
    source_file.write_bytes(b"12345")
    target_dir = tmp_path / "audio-out"
    target_dir.mkdir()
    recorded_runs: list[dict[str, object]] = []

    class StubOutput:
        """Fake ffmpeg output node that records ``run`` keyword arguments."""

        def __init__(self, output_path: Path):
            self.output_path = output_path

        def run(self, **kwargs):
            recorded_runs.append(kwargs)
            # Produce a 2-byte file, smaller than the 5-byte input.
            self.output_path.write_bytes(b"12")
            return b"", b""

    class StubInput:
        """Fake ffmpeg input node that hands out ``StubOutput`` objects."""

        def output(self, output_file: str, **params):
            del params
            return StubOutput(Path(output_file))

    monkeypatch.setattr(media.ffmpeg, "input", lambda _: StubInput())

    converted = media.transcode_audio(
        str(source_file),
        str(target_dir),
        {"extension": "mp3", "acodec": "libmp3lame"},
    )

    expected_output = str(target_dir / "converted.mp3")
    assert converted == expected_output
    assert recorded_runs == [{"capture_stdout": True, "capture_stderr": True}]
def test_transcode_video_two_pass_does_not_print_ffmpeg_output(
    monkeypatch, tmp_path: Path
) -> None:
    """A successful two-pass transcode captures ffmpeg output and prints nothing."""
    source_file = tmp_path / "input.mp4"
    source_file.write_bytes(b"12345")
    target_dir = tmp_path / "video-out"
    target_dir.mkdir()
    recorded_runs: list[dict[str, object]] = []
    print_calls: list[tuple[tuple[object, ...], dict[str, object]]] = []

    class StubOutput:
        """Fake ffmpeg output node; writes a file only when it has a target path."""

        def __init__(self, output_path: Path | None):
            self.output_path = output_path

        def global_args(self, *args):
            del args
            return self

        def run(self, **kwargs):
            recorded_runs.append(kwargs)
            if self.output_path is not None:
                self.output_path.write_bytes(b"12")
            return b"pass-out", b"pass-err"

    class StubInput:
        """Fake ffmpeg input node exposing ``video``/``audio`` placeholders."""

        video = object()
        audio = object()

        def output(self, *args, **params):
            del params
            # The first positional string ending in ".mp4" is taken as the
            # output file; calls without one get an output with no path.
            for candidate in args:
                if isinstance(candidate, str) and candidate.endswith(".mp4"):
                    return StubOutput(Path(candidate))
            return StubOutput(None)

    def record_print(*args, **kwargs):
        print_calls.append((args, kwargs))

    monkeypatch.setattr(media.ffmpeg, "input", lambda _: StubInput())
    monkeypatch.setattr("builtins.print", record_print)

    two_pass_params = {
        "extension": "mp4",
        "passes": [
            {"f": "null"},
            {"c:v": "libx264"},
        ],
    }
    converted = media.transcode_video(
        str(source_file), str(target_dir), two_pass_params
    )

    first_pass_kwargs = {"capture_stdout": True, "capture_stderr": True}
    second_pass_kwargs = {
        "capture_stdout": True,
        "capture_stderr": True,
        "overwrite_output": True,
    }
    assert converted == str(target_dir / "converted.mp4")
    assert recorded_runs == [first_pass_kwargs, second_pass_kwargs]
    assert print_calls == []
def test_transcode_video_prints_ffmpeg_output_on_error(
    monkeypatch, tmp_path: Path
) -> None:
    """When ffmpeg fails, ``transcode_video`` raises ``RuntimeError`` and prints
    the captured stderr to ``sys.stderr`` and the captured stdout elsewhere."""
    source_file = tmp_path / "input.mp4"
    source_file.write_bytes(b"12345")
    target_dir = tmp_path / "video-out"
    target_dir.mkdir()
    # Each entry is (first printed argument as text, whether file= was sys.stderr).
    printed: list[tuple[str, bool]] = []

    class FailingOutput:
        """Fake ffmpeg output node whose ``run`` always raises ``ffmpeg.Error``."""

        def run(self, **kwargs):
            del kwargs
            raise media.ffmpeg.Error("ffmpeg", b"video-stdout", b"video-stderr")

    class StubInput:
        """Fake ffmpeg input node that always yields a failing output."""

        def output(self, *args, **params):
            del args, params
            return FailingOutput()

    def fake_print(*args, **kwargs):
        message = str(args[0])
        went_to_stderr = kwargs.get("file") is sys.stderr
        printed.append((message, went_to_stderr))

    monkeypatch.setattr(media.ffmpeg, "input", lambda _: StubInput())
    monkeypatch.setattr("builtins.print", fake_print)

    single_pass_params = {"extension": "mp4", "c:v": "libx264"}
    with pytest.raises(RuntimeError):
        media.transcode_video(str(source_file), str(target_dir), single_pass_params)

    assert ("video-stderr", True) in printed
    assert ("video-stdout", False) in printed
def test_video_transcode_params_scales_to_max_height() -> None:
    """A 1920x1080 source with a ``max_height`` of 720 gets a ``scale=-2:720``
    filter merged with the profile's ffmpeg video and audio parameters."""
    probe: Any = {
        "format": {"format_name": "mp4"},
        "streams": [
            {
                "codec_type": "video",
                "codec_name": "mpeg4",
                "bit_rate": "2000000",
                "duration_ts": "1",
                "width": 1920,
                "height": 1080,
            },
            {
                "codec_type": "audio",
                "codec_name": "aac",
                "bit_rate": "128000",
                "duration_ts": "1",
            },
        ],
    }
    profile: Any = {
        "name": "720",
        "container": "mp4",
        "vcodec": "h264",
        "acodec": "mp3",
        "audio_max_bitrate": 96000,
        "ffmpeg_audio_params": {"acodec": "libmp3lame"},
        "ffmpeg_video_params": {"vcodec": "h264", "strict": "-2"},
        "max_height": 720,
        "mimetype": "video/mp4",
        "extension": "mp4",
    }
    expected = {
        "extension": "mp4",
        "vf": "scale=-2:720",
        "vcodec": "h264",
        "strict": "-2",
        "acodec": "libmp3lame",
    }

    assert media.video_transcode_params(probe, profile) == expected
def test_video_transcode_params_scales_to_max_height_for_multipass() -> None:
    """With a multi-pass profile, the ``scale=-2:720`` filter is added to every
    pass rather than to the top-level parameters."""
    probe: Any = {
        "format": {"format_name": "mp4"},
        "streams": [
            {
                "codec_type": "video",
                "codec_name": "mpeg4",
                "bit_rate": "2000000",
                "duration_ts": "1",
                "width": 1920,
                "height": 1080,
            },
            {
                "codec_type": "audio",
                "codec_name": "mp3",
                "bit_rate": "128000",
                "duration_ts": "1",
            },
        ],
    }
    profile = cast(
        media.VideoSettings,
        {
            "name": "720",
            "container": "webm",
            "vcodec": "libvpx-vp9",
            "acodec": "opus",
            "audio_max_bitrate": 96000,
            "ffmpeg_audio_params": {"c:a": "libopus", "b:a": "96k"},
            "ffmpeg_video_params": {},
            "max_height": 720,
            "mimetype": "video/webm",
            "extension": "webm",
            "passes": [
                {"c:v": "libvpx-vp9", "pass": "1", "f": "null"},
                {"c:v": "libvpx-vp9", "pass": "2", "c:a": "libopus"},
            ],
        },
    )
    expected_first_pass = {
        "c:v": "libvpx-vp9",
        "pass": "1",
        "f": "null",
        "vf": "scale=-2:720",
    }
    expected_second_pass = {
        "c:v": "libvpx-vp9",
        "pass": "2",
        "c:a": "libopus",
        "vf": "scale=-2:720",
    }

    computed = media.video_transcode_params(probe, profile)

    assert computed == {
        "extension": "webm",
        "passes": [expected_first_pass, expected_second_pass],
    }
def test_audio_transcode_params_accepts_m4a_format_family() -> None:
    """An AAC source probed as ``mov,mp4,m4a,3gp,3g2,mj2`` yields ``None`` from
    ``audio_transcode_params`` for the ``m4a`` profile."""
    probe: Any = {
        "format": {
            "bit_rate": "20000",
            "format_name": "mov,mp4,m4a,3gp,3g2,mj2",
        },
        "streams": [
            {
                "codec_type": "audio",
                "codec_name": "aac",
                "bit_rate": "20000",
                "duration_ts": "1",
            }
        ],
    }
    m4a_profile = cast(
        media.AudioSettings,
        {
            "name": "m4a",
            "format": "m4a",
            "max_bitrate": 64000,
            "mimetype": "audio/mp4",
            "extension": "m4a",
            "ffmpeg_audio_params": {
                "acodec": "libfdk_aac",
                "vbr": "2",
            },
        },
    )

    computed = media.audio_transcode_params(probe, m4a_profile)

    assert computed is None
def test_audio_meta_handles_webm_without_duration_ts() -> None:
    """``audio_meta`` works on a WebM probe whose audio stream has neither
    ``duration_ts`` nor a stream-level ``bit_rate``."""
    webm_probe: Any = {
        "format": {
            "duration": "1.0",
            "size": "100",
            "bit_rate": "48000",
            "format_name": "matroska,webm",
        },
        "streams": [
            {
                "codec_type": "audio",
                "codec_name": "opus",
                "sample_rate": "48000",
                "channels": 1,
            }
        ],
    }
    expected_meta = {
        "duration": "1.0",
        "fileSize": "100",
        "bitrate": 48000,
        "samplingrate": 48000,
        "channels": 1,
    }

    assert media.audio_meta(webm_probe) == expected_meta
def test_video_meta_handles_webm_without_duration_ts() -> None:
    """``video_meta`` works on a WebM probe whose streams carry neither
    ``duration_ts`` nor stream-level ``bit_rate``."""
    webm_probe: Any = {
        "format": {
            "duration": "1.0",
            "size": "200",
            "bit_rate": "64000",
            "format_name": "matroska,webm",
        },
        "streams": [
            {
                "codec_type": "video",
                "codec_name": "vp9",
                "width": 640,
                "height": 360,
                "avg_frame_rate": "25/1",
            },
            {
                "codec_type": "audio",
                "codec_name": "opus",
                "sample_rate": "48000",
                "channels": 1,
            },
        ],
    }
    expected_meta = {
        "duration": "1.0",
        "fileSize": "200",
        "width": 640,
        "height": 360,
        "bitrate": 64000,
        "framerate": "25/1",
    }

    assert media.video_meta(webm_probe) == expected_meta
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """``AudioPipeline.media_downloaded`` persists the source plus three
    transcoded profiles and reports the first profile as the canonical file.

    Transcoding, probing and persistence are all stubbed; the test checks the
    returned file info, the order files are persisted in, and that
    ``item_completed`` stores the result on ``item.audios``.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/podcast.mp3"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Write the profile name as the "transcoded" payload.
        produced = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        produced.write_bytes(settings["name"].encode("utf-8"))
        return str(produced)

    audio_default_path = published_media_path(
        FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0]
    )
    audio_m4a_path = published_media_path(
        FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1]
    )
    audio_webm_path = published_media_path(
        FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2]
    )

    def audio_probe(
        *,
        size: str,
        bit_rate: str,
        format_name: str,
        format_long_name: str,
        codec_name: str,
        sample_rate: str,
        channels: int,
        timed: bool = True,
    ):
        """Build one fake probe result; ``timed=False`` omits the stream's
        ``bit_rate`` and ``duration_ts`` keys entirely."""
        stream: dict[str, Any] = {"codec_type": "audio", "codec_name": codec_name}
        if timed:
            stream["bit_rate"] = bit_rate
            stream["duration_ts"] = "61200"
        stream["sample_rate"] = sample_rate
        stream["channels"] = channels
        return {
            "format": {
                "duration": "61.2",
                "size": size,
                "bit_rate": bit_rate,
                "format_name": format_name,
                "format_long_name": format_long_name,
            },
            "streams": [stream],
        }

    def fake_probe_media(file_path: str):
        # A file is recognised either by its published path suffix or by the
        # temporary file name the transcode step would produce.
        file_name = Path(file_path).name
        if file_path.endswith(audio_default_path) or file_name == "mp3_vbr7_voice.mp3":
            return audio_probe(
                size="4567",
                bit_rate="37209",
                format_name="mp3",
                format_long_name="MP3",
                codec_name="mp3",
                sample_rate="48000",
                channels=1,
            )
        if file_path.endswith(audio_m4a_path) or file_name == "m4a_aac_vbr2_voice.m4a":
            return audio_probe(
                size="3456",
                bit_rate="20746",
                format_name="mov,mp4,m4a,3gp,3g2,mj2",
                format_long_name="AAC",
                codec_name="aac",
                sample_rate="48000",
                channels=1,
            )
        if (
            file_path.endswith(audio_webm_path)
            or file_name == "webm_opus_voice_48k.webm"
        ):
            return audio_probe(
                size="2345",
                bit_rate="48000",
                format_name="matroska,webm",
                format_long_name="WebM",
                codec_name="opus",
                sample_rate="48000",
                channels=1,
                timed=False,
            )
        # Anything else is treated as the downloaded source file.
        return audio_probe(
            size="5678",
            bit_rate="128000",
            format_name="mp3",
            format_long_name="MP3",
            codec_name="mp3",
            sample_rate="44100",
            channels=2,
        )

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        del info, meta
        assert headers is not None
        destination = store_dir(pipeline) / path
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"source-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )
    audio_base_path = local_audio_path(source_url)

    def expected_variant(
        path: str,
        mimetype: str,
        is_default: str,
        file_size: str,
        bitrate: int,
        samplingrate: int,
        channels: int,
    ) -> dict[str, Any]:
        return {
            "url": f"https://mirror.example/feeds/nasa/audio/{path}",
            "path": path,
            "type": mimetype,
            "medium": "audio",
            "isDefault": is_default,
            "fileSize": file_size,
            "bitrate": bitrate,
            "duration": "61.2",
            "samplingrate": samplingrate,
            "channels": channels,
        }

    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    assert result == {
        "url": source_url,
        "path": audio_default_path,
        "published_url": (
            f"https://mirror.example/feeds/nasa/audio/{audio_default_path}"
        ),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            expected_variant(
                audio_default_path, "audio/mpeg", "true", "4567", 37209, 48000, 1
            ),
            expected_variant(
                audio_m4a_path, "audio/mp4", "false", "3456", 20746, 48000, 1
            ),
            expected_variant(
                audio_webm_path, "audio/webm", "false", "2345", 48000, 48000, 1
            ),
            expected_variant(
                audio_base_path, "audio/mpeg", "false", "5678", 128000, 44100, 2
            ),
        ],
    }
    # The source is persisted first, then the profiles in configured order.
    assert persisted == [
        (audio_base_path, "audio/mpeg"),
        (audio_default_path, "audio/mpeg"),
        (audio_m4a_path, "audio/mp4"),
        (audio_webm_path, "audio/webm"),
    ]

    completed_item = pipeline.item_completed(
        [(True, result)],
        item,
        spider_info(),
    )
    assert completed_item.audios == [result]
def test_image_mimetype_does_not_guess_from_url_extension() -> None:
    """A ``.jpg`` URL on its own must not produce a mimetype."""
    guessed = image_mimetype(url="https://example.com/photo.jpg")
    assert guessed is None
def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """``ImageNormalizePipeline.media_downloaded`` stores the untouched source
    image plus one file per configured image profile.

    The test checks the returned file info, that the stored source bytes match
    the download, and that both written variants keep the 2x3 dimensions.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    source_url = "https://example.com/photo.png"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )
    image_profiles = crawler.settings["REPUBLISHER_IMAGE"]
    canonical_path = canonical_published_image_path(source_url, image_profiles)
    source_path = source_image_path(source_url, "image/png")
    webp_path = published_image_path(source_url, image_profiles[0])
    jpeg_path = published_image_path(source_url, image_profiles[1])
    source_body = transparent_png_bytes()
    response = Response(
        url=source_url,
        body=source_body,
        status=200,
        headers={"Content-Type": "image/png"},
    )

    result = pipeline.media_downloaded(
        response,
        Request(source_url),
        spider_info(),
        item=item,
    )

    # File sizes depend on the encoder, so they are read back from the result
    # and only type-checked further down.
    webp_file_size = result["variants"][0].get("fileSize")
    jpeg_file_size = result["variants"][1].get("fileSize")
    expected_webp_variant = {
        "url": f"https://mirror.example/feeds/nasa/images/{webp_path}",
        "path": webp_path,
        "type": "image/webp",
        "medium": "image",
        "isDefault": "true",
        "fileSize": webp_file_size,
        "width": 2,
        "height": 3,
    }
    expected_jpeg_variant = {
        "url": f"https://mirror.example/feeds/nasa/images/{jpeg_path}",
        "path": jpeg_path,
        "type": "image/jpeg",
        "medium": "image",
        "isDefault": "false",
        "fileSize": jpeg_file_size,
        "width": 2,
        "height": 3,
    }
    assert result == {
        "url": source_url,
        "path": canonical_path,
        "published_url": f"https://mirror.example/feeds/nasa/images/{canonical_path}",
        "checksum": result["checksum"],
        "status": "downloaded",
        "source_path": source_path,
        "variants": [expected_webp_variant, expected_jpeg_variant],
        "thumbnails": [],
    }
    assert isinstance(result["checksum"], str)
    assert isinstance(webp_file_size, int)
    assert isinstance(jpeg_file_size, int)

    images_root = store_dir(pipeline)
    assert (images_root / source_path).read_bytes() == source_body
    webp_image = cast(Any, pyvips.Image.new_from_file(str(store_dir(pipeline) / webp_path)))
    jpeg_image = cast(Any, pyvips.Image.new_from_file(str(store_dir(pipeline) / jpeg_path)))
    assert (webp_image.width, webp_image.height) == (2, 3)
    assert (jpeg_image.width, jpeg_image.height) == (2, 3)
    # The four-band source must come out of the JPEG profile with three bands.
    assert jpeg_image.bands == 3

    completed_item = pipeline.item_completed([(True, result)], item, spider_info())
    assert completed_item.images == [result]
def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image(
    monkeypatch, tmp_path: Path
) -> None:
    """``ImageThumbnailPipeline.process_item`` adds one named thumbnail per
    configured thumbnail profile to an already-normalized image.

    A 1200x900 PNG is first run through ``ImageNormalizePipeline``; the
    thumbnail pipeline must then leave the canonical path alone, emit the
    ``card_hero`` (640x360, JPEG) and ``list_square`` (160x160) thumbnails at
    the expected paths, and write files whose real dimensions match the
    reported ones.
    """
    crawler = build_test_crawler(tmp_path)
    normalize_pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    thumbnail_pipeline = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(normalize_pipeline, "inc_stats", lambda status: None)
    source_url = "https://example.com/photo.png"
    source_body = png_bytes(1200, 900)
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )
    normalized = normalize_pipeline.media_downloaded(
        Response(
            url=source_url,
            body=source_body,
            status=200,
            headers={"Content-Type": "image/png"},
        ),
        Request(source_url),
        spider_info(),
        item=item,
    )
    item.images = [normalized]

    processed = thumbnail_pipeline.process_item(item, spider_info().spider)
    thumbnails = processed.images[0]["thumbnails"]

    assert processed.images[0]["path"] == canonical_published_image_path(
        source_url,
        crawler.settings["REPUBLISHER_IMAGE"],
    )
    # Check the slot list BEFORE picking thumbnails apart: a missing or extra
    # thumbnail then fails with a readable assertion diff instead of an
    # IndexError raised while indexing.
    thumb_slots = [thumb.get("slot") for thumb in thumbnails]
    assert thumb_slots == ["card_hero", "list_square"]
    first_thumb, second_thumb = thumbnails

    thumbnail_profiles = crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"]
    assert first_thumb.get("path") == thumbnail_image_path(
        source_url,
        thumbnail_profiles[0],
    )
    assert first_thumb.get("type") == "image/jpeg"
    assert first_thumb.get("width") == 640
    assert first_thumb.get("height") == 360
    assert second_thumb.get("path") == thumbnail_image_path(
        source_url,
        thumbnail_profiles[1],
    )
    assert second_thumb.get("width") == 160
    assert second_thumb.get("height") == 160

    # Every reported thumbnail must exist on disk with the reported size.
    for thumb in thumbnails:
        thumb_path = thumb.get("path")
        thumb_width = thumb.get("width")
        thumb_height = thumb.get("height")
        thumb_image = cast(
            Any,
            pyvips.Image.new_from_file(
                str(store_dir(normalize_pipeline) / str(thumb_path))
            ),
        )
        assert (thumb_image.width, thumb_image.height) == (thumb_width, thumb_height)
def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls(
    monkeypatch, tmp_path: Path
) -> None:
    """For a URL without a file extension, a later ``media_to_download`` cache
    hit reports the same ``.png`` source path the download persisted."""
    crawler = build_test_crawler(tmp_path)
    pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    source_url = "https://example.com/photo"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )
    # The extension can only come from the Content-Type header here.
    png_response = Response(
        url=source_url,
        body=transparent_png_bytes(),
        status=200,
        headers={"Content-Type": "image/png"},
    )

    first_pass = pipeline.media_downloaded(
        png_response,
        Request(source_url),
        spider_info(),
        item=item,
    )
    second_pass = pipeline.media_to_download(
        Request(source_url), spider_info(), item=item
    )

    assert first_pass["source_path"].endswith(".png")
    assert second_pass is not None
    assert second_pass["source_path"] == first_pass["source_path"]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """``VideoPipeline.media_downloaded`` persists the source plus two
    transcoded profiles and reports the first profile as the canonical file.

    Transcoding, probing and persistence are stubbed; the test checks the
    returned file info and the order files are persisted in.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = VideoPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/video.mp4"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[source_url],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Write the profile name as the "transcoded" payload.
        produced = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        produced.write_bytes(settings["name"].encode("utf-8"))
        return str(produced)

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    video_main_path = published_media_path(
        FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[0]
    )
    video_fallback_path = published_media_path(
        FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[1]
    )

    def video_probe(
        *,
        size: str,
        bit_rate: str,
        format_name: str,
        format_long_name: str,
        vcodec: str,
        acodec: str,
        width: int,
        height: int,
        frame_rate: str,
        timed: bool = True,
    ):
        """Build one fake probe result; ``timed=False`` omits ``bit_rate`` and
        ``duration_ts`` from both streams."""
        video_stream: dict[str, Any] = {"codec_type": "video", "codec_name": vcodec}
        audio_stream: dict[str, Any] = {"codec_type": "audio", "codec_name": acodec}
        if timed:
            video_stream["bit_rate"] = bit_rate
            video_stream["duration_ts"] = "60000"
            audio_stream["bit_rate"] = "96000"
            audio_stream["duration_ts"] = "60000"
        video_stream["width"] = width
        video_stream["height"] = height
        video_stream["avg_frame_rate"] = frame_rate
        return {
            "format": {
                "duration": "60.0",
                "size": size,
                "bit_rate": bit_rate,
                "format_name": format_name,
                "format_long_name": format_long_name,
            },
            "streams": [video_stream, audio_stream],
        }

    def fake_probe_media(file_path: str):
        # A file is recognised either by its published path suffix or by the
        # temporary file name the transcode step would produce.
        file_name = Path(file_path).name
        if file_path.endswith(video_main_path) or file_name == "main.mp4":
            return video_probe(
                size="9876",
                bit_rate="123456",
                format_name="mp4",
                format_long_name="MP4",
                vcodec="h264",
                acodec="aac",
                width=1280,
                height=720,
                frame_rate="30/1",
            )
        if file_path.endswith(video_fallback_path) or file_name == "fallback.webm":
            return video_probe(
                size="6789",
                bit_rate="64000",
                format_name="matroska,webm",
                format_long_name="WebM",
                vcodec="vp9",
                acodec="opus",
                width=1280,
                height=720,
                frame_rate="25/1",
                timed=False,
            )
        # Anything else is treated as the downloaded source file.
        return video_probe(
            size="12345",
            bit_rate="456789",
            format_name="mp4",
            format_long_name="MP4",
            vcodec="h264",
            acodec="mp3",
            width=640,
            height=360,
            frame_rate="24/1",
        )

    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        del info, meta
        assert headers is not None
        destination = store_dir(pipeline) / path
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"video-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )
    video_base_path = local_video_path(source_url)

    def expected_variant(
        path: str,
        mimetype: str,
        is_default: str,
        file_size: str,
        bitrate: int,
        width: int,
        height: int,
        framerate: str,
    ) -> dict[str, Any]:
        return {
            "url": f"https://mirror.example/feeds/nasa/video/{path}",
            "path": path,
            "type": mimetype,
            "medium": "video",
            "isDefault": is_default,
            "fileSize": file_size,
            "bitrate": bitrate,
            "duration": "60.0",
            "width": width,
            "height": height,
            "framerate": framerate,
        }

    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    assert result == {
        "url": source_url,
        "path": video_main_path,
        "published_url": (f"https://mirror.example/feeds/nasa/video/{video_main_path}"),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            expected_variant(
                video_main_path, "video/mp4", "true", "9876", 123456, 1280, 720, "30/1"
            ),
            expected_variant(
                video_fallback_path,
                "video/webm",
                "false",
                "6789",
                64000,
                1280,
                720,
                "25/1",
            ),
            expected_variant(
                video_base_path, "video/mp4", "false", "12345", 456789, 640, 360, "24/1"
            ),
        ],
    }
    # The source is persisted first, then the profiles in configured order.
    assert persisted == [
        (video_base_path, "video/mp4"),
        (video_main_path, "video/mp4"),
        (video_fallback_path, "video/webm"),
    ]
def test_audio_pipeline_media_to_download_checks_canonical_path(
    monkeypatch, tmp_path: Path
) -> None:
    """``AudioPipeline.media_to_download`` reports an up-to-date result when
    the source and all published variants already exist on disk.

    The first ``stat_file`` lookup must target the canonical (first profile)
    path, and the legacy ``<base path>.mp3`` location must never be checked.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    source_url = "https://example.com/podcast.mp3"
    audio_base_path = local_audio_path(source_url)
    original_path = store_dir(pipeline) / audio_base_path
    audio_default_path = published_media_path(
        FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0]
    )
    audio_m4a_path = published_media_path(
        FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1]
    )
    audio_webm_path = published_media_path(
        FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2]
    )

    # Seed the store. Each file creates its own parent directory, so the test
    # does not depend on all variants sharing one directory.
    seeded_files = (
        (audio_base_path, b"original"),
        (audio_default_path, b"default"),
        (audio_m4a_path, b"alt-aac"),
        (audio_webm_path, b"alt-webm"),
    )
    for relative_path, payload in seeded_files:
        seeded = store_dir(pipeline) / relative_path
        seeded.parent.mkdir(parents=True, exist_ok=True)
        seeded.write_bytes(payload)

    stat_paths: list[str] = []
    original_stat_file = pipeline.store.stat_file
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def wrapped_stat_file(path, info):
        # Record every path the pipeline stats, then defer to the real store.
        stat_paths.append(path)
        return original_stat_file(path, info)

    monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file)

    # Probe values per published variant, matched by path suffix in this order.
    # The webm entry has no stream-level bit rate or ``duration_ts``.
    variant_probe_fields: tuple[tuple[str, dict[str, Any]], ...] = (
        (
            audio_default_path,
            {
                "size": "4567",
                "bit_rate": "37209",
                "format_name": "mp3",
                "codec_name": "mp3",
                "stream_bit_rate": "37209",
                "duration_ts": "61200",
            },
        ),
        (
            audio_m4a_path,
            {
                "size": "3456",
                "bit_rate": "20746",
                "format_name": "mov,mp4,m4a,3gp,3g2,mj2",
                "codec_name": "aac",
                "stream_bit_rate": "20746",
                "duration_ts": "61200",
            },
        ),
        (
            audio_webm_path,
            {
                "size": "2345",
                "bit_rate": "48000",
                "format_name": "matroska,webm",
                "codec_name": "opus",
                "stream_bit_rate": None,
                "duration_ts": None,
            },
        ),
    )
    # Used for any path that matches none of the variants (the source file).
    source_probe_fields: dict[str, Any] = {
        "size": "5678",
        "bit_rate": "128000",
        "format_name": "mp3",
        "codec_name": "mp3",
        "stream_bit_rate": "128000",
        "duration_ts": "61200",
    }

    def fake_probe_media(file_path: str) -> dict[str, Any]:
        fields = next(
            (
                candidate
                for suffix, candidate in variant_probe_fields
                if file_path.endswith(suffix)
            ),
            source_probe_fields,
        )
        # Only the exact source file is reported as 44.1 kHz stereo.
        is_original = file_path == str(original_path)
        return {
            "format": {
                "duration": "61.2",
                "size": fields["size"],
                "bit_rate": fields["bit_rate"],
                "format_name": fields["format_name"],
                "format_long_name": "Audio",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": fields["codec_name"],
                    "bit_rate": fields["stream_bit_rate"],
                    "duration_ts": fields["duration_ts"],
                    "sample_rate": "44100" if is_original else "48000",
                    "channels": 2 if is_original else 1,
                }
            ],
        }

    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    result = pipeline.media_to_download(
        Request(source_url),
        spider_info(),
        item=item,
    )

    assert result is not None
    assert result["path"] == audio_default_path
    assert result["status"] == "uptodate"
    assert [variant.get("path") for variant in result["variants"]] == [
        audio_default_path,
        audio_m4a_path,
        audio_webm_path,
        audio_base_path,
    ]
    assert f"{audio_base_path}.mp3" not in stat_paths
    assert stat_paths[0] == audio_default_path