967 lines
30 KiB
Python
967 lines
30 KiB
Python
import sys
|
|
from pathlib import Path
|
|
from types import SimpleNamespace
|
|
from typing import Any, cast
|
|
|
|
import pytest
|
|
from scrapy.crawler import Crawler
|
|
from scrapy.http import Request, Response
|
|
|
|
from repub import media
|
|
from repub import settings as repub_settings
|
|
from repub.config import (
|
|
FeedConfig,
|
|
RepublisherConfig,
|
|
build_base_settings,
|
|
build_feed_settings,
|
|
)
|
|
from repub.items import ElementItem
|
|
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
|
|
from repub.utils import (
|
|
FileType,
|
|
local_audio_path,
|
|
local_video_path,
|
|
published_media_path,
|
|
)
|
|
|
|
|
|
def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
    """Build a minimal crawler stand-in configured for the ``nasa`` feed.

    The returned namespace exposes only ``settings`` and
    ``request_fingerprinter``. Settings are derived from a single-feed
    ``RepublisherConfig`` whose output directory is ``tmp_path / "mirror"``,
    with ``REPUBLISHER_FEED_URL`` set to ``https://mirror.example`` at
    ``cmdline`` priority.
    """
    mirror_dir = (tmp_path / "mirror").resolve()
    nasa_feed = FeedConfig(
        name="NASA Breaking News",
        slug="nasa",
        url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
    )
    republisher_config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=mirror_dir,
        feeds=(nasa_feed,),
        scrapy_settings={},
    )
    feed_settings = build_feed_settings(
        build_base_settings(republisher_config),
        out_dir=mirror_dir,
        feed_slug="nasa",
    )
    feed_settings.set(
        "REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline"
    )
    return SimpleNamespace(
        settings=feed_settings,
        request_fingerprinter=object(),
    )
|
|
|
|
|
|
def spider_info() -> Any:
    """Return a stub info object whose ``spider`` is an empty namespace."""
    stub_spider = SimpleNamespace()
    return SimpleNamespace(spider=stub_spider)
|
|
|
|
|
|
def store_dir(pipeline: Any) -> Path:
    """Return ``pipeline.store.basedir`` wrapped in a ``Path``."""
    # The cast only silences type checkers; it has no runtime effect.
    store = cast(Any, pipeline.store)
    return Path(store.basedir)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("pipeline_cls", "store_setting"),
    [
        (AudioPipeline, "AUDIO_STORE"),
        (VideoPipeline, "VIDEO_STORE"),
        (FilePipeline, "FILES_STORE"),
    ],
)
def test_pipeline_from_crawler_uses_configured_store(
    tmp_path: Path, pipeline_cls, store_setting: str
) -> None:
    """Each pipeline built via ``from_crawler`` keeps the crawler settings
    and stores files under the directory named by its ``*_STORE`` setting.
    """
    fake_crawler = build_test_crawler(tmp_path)

    built = pipeline_cls.from_crawler(cast(Crawler, fake_crawler))

    assert built.settings is fake_crawler.settings
    configured_dir = Path(fake_crawler.settings[store_setting])
    assert store_dir(built) == configured_dir
|
|
|
|
|
|
def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
    """``transcode_audio`` runs ffmpeg with stdout and stderr captured."""
    source = tmp_path / "input.mp3"
    source.write_bytes(b"12345")
    target_dir = tmp_path / "audio-out"
    target_dir.mkdir()
    recorded_runs: list[dict[str, object]] = []

    class StubOutput:
        """Stands in for the ffmpeg output node; records ``run`` kwargs."""

        def __init__(self, output_path: Path):
            self.output_path = output_path

        def run(self, **kwargs):
            recorded_runs.append(kwargs)
            # Write an output file so there is a result on disk.
            self.output_path.write_bytes(b"12")
            return b"", b""

    class StubInput:
        """Stands in for the ffmpeg input node."""

        def output(self, output_file: str, **params):
            del params
            return StubOutput(Path(output_file))

    monkeypatch.setattr(media.ffmpeg, "input", lambda _: StubInput())

    audio_params = {"extension": "mp3", "acodec": "libmp3lame"}
    converted = media.transcode_audio(str(source), str(target_dir), audio_params)

    assert converted == str(target_dir / "converted.mp3")
    assert recorded_runs == [{"capture_stdout": True, "capture_stderr": True}]
|
|
|
|
|
|
def test_transcode_video_two_pass_does_not_print_ffmpeg_output(
    monkeypatch, tmp_path: Path
) -> None:
    """A successful two-pass video transcode captures ffmpeg output and
    prints nothing.
    """
    source = tmp_path / "input.mp4"
    source.write_bytes(b"12345")
    target_dir = tmp_path / "video-out"
    target_dir.mkdir()
    recorded_runs: list[dict[str, object]] = []
    print_calls: list[tuple[tuple[object, ...], dict[str, object]]] = []

    class StubOutput:
        """Stands in for the ffmpeg output node; records ``run`` kwargs."""

        def __init__(self, output_path: Path | None):
            self.output_path = output_path

        def global_args(self, *args):
            del args
            return self

        def run(self, **kwargs):
            recorded_runs.append(kwargs)
            # Only outputs that target an .mp4 path write a file.
            if self.output_path is not None:
                self.output_path.write_bytes(b"12")
            return b"pass-out", b"pass-err"

    class StubInput:
        """Stands in for the ffmpeg input node with video/audio streams."""

        video = object()
        audio = object()

        def output(self, *args, **params):
            del params
            # Pick the first positional string argument ending in ".mp4".
            mp4_target = None
            for candidate in args:
                if isinstance(candidate, str) and candidate.endswith(".mp4"):
                    mp4_target = Path(candidate)
                    break
            return StubOutput(mp4_target)

    def record_print(*args, **kwargs):
        print_calls.append((args, kwargs))

    monkeypatch.setattr(media.ffmpeg, "input", lambda _: StubInput())
    monkeypatch.setattr("builtins.print", record_print)

    two_pass_params = {
        "extension": "mp4",
        "passes": [
            {"f": "null"},
            {"c:v": "libx264"},
        ],
    }
    converted = media.transcode_video(str(source), str(target_dir), two_pass_params)

    assert converted == str(target_dir / "converted.mp4")
    first_pass_kwargs = {"capture_stdout": True, "capture_stderr": True}
    second_pass_kwargs = {
        "capture_stdout": True,
        "capture_stderr": True,
        "overwrite_output": True,
    }
    assert recorded_runs == [first_pass_kwargs, second_pass_kwargs]
    assert print_calls == []
|
|
|
|
|
|
def test_transcode_video_prints_ffmpeg_output_on_error(
    monkeypatch, tmp_path: Path
) -> None:
    """When ffmpeg fails, ``transcode_video`` raises ``RuntimeError`` and
    prints the captured stderr to ``sys.stderr`` and stdout elsewhere.
    """
    source = tmp_path / "input.mp4"
    source.write_bytes(b"12345")
    target_dir = tmp_path / "video-out"
    target_dir.mkdir()
    # Each entry is (first printed argument as str, printed-to-stderr flag).
    print_log: list[tuple[str, bool]] = []

    class FailingOutput:
        """Output node whose ``run`` always raises an ffmpeg error."""

        def run(self, **kwargs):
            del kwargs
            raise media.ffmpeg.Error("ffmpeg", b"video-stdout", b"video-stderr")

    class StubInput:
        """Input node that always yields a failing output."""

        def output(self, *args, **params):
            del args, params
            return FailingOutput()

    def record_print(*args, **kwargs):
        first_arg = str(args[0])
        went_to_stderr = kwargs.get("file") is sys.stderr
        print_log.append((first_arg, went_to_stderr))

    monkeypatch.setattr(media.ffmpeg, "input", lambda _: StubInput())
    monkeypatch.setattr("builtins.print", record_print)

    single_pass_params = {"extension": "mp4", "c:v": "libx264"}
    with pytest.raises(RuntimeError):
        media.transcode_video(str(source), str(target_dir), single_pass_params)

    assert ("video-stderr", True) in print_log
    assert ("video-stdout", False) in print_log
|
|
|
|
|
|
def test_video_transcode_params_scales_to_max_height() -> None:
    """A 1920x1080 source with a ``max_height`` of 720 gets a
    ``scale=-2:720`` video filter merged with the profile's ffmpeg params.
    """
    profile: media.VideoSettings = {
        "name": "720",
        "container": "mp4",
        "vcodec": "h264",
        "acodec": "mp3",
        "audio_max_bitrate": 96000,
        "ffmpeg_audio_params": {"acodec": "libmp3lame"},
        "ffmpeg_video_params": {"vcodec": "h264", "strict": "-2"},
        "max_height": 720,
        "mimetype": "video/mp4",
        "extension": "mp4",
    }
    expected = {
        "extension": "mp4",
        "vf": "scale=-2:720",
        "vcodec": "h264",
        "strict": "-2",
        "acodec": "libmp3lame",
    }

    actual = media.video_transcode_params(
        {
            "format": {"format_name": "mp4"},
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "mpeg4",
                    "bit_rate": "2000000",
                    "duration_ts": "1",
                    "width": 1920,
                    "height": 1080,
                },
                {
                    "codec_type": "audio",
                    "codec_name": "aac",
                    "bit_rate": "128000",
                    "duration_ts": "1",
                },
            ],
        },
        profile,
    )

    assert actual == expected
|
|
|
|
|
|
def test_video_transcode_params_scales_to_max_height_for_multipass() -> None:
    """With a multi-pass profile, the ``scale=-2:720`` filter is added to
    every pass.
    """
    multipass_profile = cast(
        media.VideoSettings,
        {
            "name": "720",
            "container": "webm",
            "vcodec": "libvpx-vp9",
            "acodec": "opus",
            "audio_max_bitrate": 96000,
            "ffmpeg_audio_params": {"c:a": "libopus", "b:a": "96k"},
            "ffmpeg_video_params": {},
            "max_height": 720,
            "mimetype": "video/webm",
            "extension": "webm",
            "passes": [
                {"c:v": "libvpx-vp9", "pass": "1", "f": "null"},
                {"c:v": "libvpx-vp9", "pass": "2", "c:a": "libopus"},
            ],
        },
    )
    expected_first_pass = {
        "c:v": "libvpx-vp9",
        "pass": "1",
        "f": "null",
        "vf": "scale=-2:720",
    }
    expected_second_pass = {
        "c:v": "libvpx-vp9",
        "pass": "2",
        "c:a": "libopus",
        "vf": "scale=-2:720",
    }

    actual = media.video_transcode_params(
        {
            "format": {"format_name": "mp4"},
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "mpeg4",
                    "bit_rate": "2000000",
                    "duration_ts": "1",
                    "width": 1920,
                    "height": 1080,
                },
                {
                    "codec_type": "audio",
                    "codec_name": "mp3",
                    "bit_rate": "128000",
                    "duration_ts": "1",
                },
            ],
        },
        multipass_profile,
    )

    assert actual == {
        "extension": "webm",
        "passes": [expected_first_pass, expected_second_pass],
    }
|
|
|
|
|
|
def test_audio_transcode_params_accepts_m4a_format_family() -> None:
    """A probe reporting ffprobe's combined ``mov,mp4,m4a,...`` format name
    with AAC at 20000 bit/s yields ``None`` for the ``m4a`` profile.
    """
    m4a_profile = cast(
        media.AudioSettings,
        {
            "name": "m4a",
            "format": "m4a",
            "max_bitrate": 64000,
            "mimetype": "audio/mp4",
            "extension": "m4a",
            "ffmpeg_audio_params": {
                "acodec": "libfdk_aac",
                "vbr": "2",
            },
        },
    )

    outcome = media.audio_transcode_params(
        {
            "format": {
                "bit_rate": "20000",
                "format_name": "mov,mp4,m4a,3gp,3g2,mj2",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": "aac",
                    "bit_rate": "20000",
                    "duration_ts": "1",
                }
            ],
        },
        m4a_profile,
    )

    assert outcome is None
|
|
|
|
|
|
def test_audio_meta_handles_webm_without_duration_ts() -> None:
    """``audio_meta`` works when the audio stream carries neither
    ``duration_ts`` nor ``bit_rate``; values come from the format section.
    """
    extracted = media.audio_meta(
        {
            "format": {
                "duration": "1.0",
                "size": "100",
                "bit_rate": "48000",
                "format_name": "matroska,webm",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": "opus",
                    "sample_rate": "48000",
                    "channels": 1,
                }
            ],
        }
    )

    expected = {
        "duration": "1.0",
        "fileSize": "100",
        "bitrate": 48000,
        "samplingrate": 48000,
        "channels": 1,
    }
    assert extracted == expected
|
|
|
|
|
|
def test_video_meta_handles_webm_without_duration_ts() -> None:
    """``video_meta`` works when the streams carry neither ``duration_ts``
    nor ``bit_rate``; values come from the format section.
    """
    extracted = media.video_meta(
        {
            "format": {
                "duration": "1.0",
                "size": "200",
                "bit_rate": "64000",
                "format_name": "matroska,webm",
            },
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "vp9",
                    "width": 640,
                    "height": 360,
                    "avg_frame_rate": "25/1",
                },
                {
                    "codec_type": "audio",
                    "codec_name": "opus",
                    "sample_rate": "48000",
                    "channels": 1,
                },
            ],
        }
    )

    expected = {
        "duration": "1.0",
        "fileSize": "200",
        "width": 640,
        "height": 360,
        "bitrate": 64000,
        "framerate": "25/1",
    }
    assert extracted == expected
|
|
|
|
|
|
def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """``AudioPipeline.media_downloaded`` reports the default transcode as
    the canonical file, lists every variant (original last), and persists
    the original first followed by each transcode.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/podcast.mp3"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Emit a tiny file named after the profile instead of running ffmpeg.
        produced = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        produced.write_bytes(settings["name"].encode("utf-8"))
        return str(produced)

    audio_profiles = repub_settings.REPUBLISHER_AUDIO
    audio_default_path = published_media_path(
        FileType.AUDIO, source_url, audio_profiles[0]
    )
    audio_m4a_path = published_media_path(
        FileType.AUDIO, source_url, audio_profiles[1]
    )
    audio_webm_path = published_media_path(
        FileType.AUDIO, source_url, audio_profiles[2]
    )

    def probe_result(
        size: str,
        bit_rate: str,
        format_name: str,
        long_name: str,
        codec: str,
        sample_rate: str,
        channels: int,
        timed: bool = True,
    ) -> dict[str, Any]:
        # Build a fresh ffprobe-shaped dict; untimed streams omit
        # ``bit_rate`` and ``duration_ts``.
        stream: dict[str, Any] = {"codec_type": "audio", "codec_name": codec}
        if timed:
            stream["bit_rate"] = bit_rate
            stream["duration_ts"] = "61200"
        stream["sample_rate"] = sample_rate
        stream["channels"] = channels
        return {
            "format": {
                "duration": "61.2",
                "size": size,
                "bit_rate": bit_rate,
                "format_name": format_name,
                "format_long_name": long_name,
            },
            "streams": [stream],
        }

    def fake_probe_media(file_path: str):
        # Match either the published path or the temporary transcode name.
        file_name = Path(file_path).name
        if file_path.endswith(audio_default_path) or file_name == "mp3_vbr7_voice.mp3":
            return probe_result("4567", "37209", "mp3", "MP3", "mp3", "48000", 1)
        if file_path.endswith(audio_m4a_path) or file_name == "m4a_aac_vbr2_voice.m4a":
            return probe_result(
                "3456", "20746", "mov,mp4,m4a,3gp,3g2,mj2", "AAC", "aac", "48000", 1
            )
        if (
            file_path.endswith(audio_webm_path)
            or file_name == "webm_opus_voice_48k.webm"
        ):
            return probe_result(
                "2345", "48000", "matroska,webm", "WebM", "opus", "48000", 1,
                timed=False,
            )
        # Anything else is treated as the untouched original download.
        return probe_result("5678", "128000", "mp3", "MP3", "mp3", "44100", 2)

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        del info, meta
        assert headers is not None
        destination = store_dir(pipeline) / path
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"source-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )

    audio_base_path = local_audio_path(source_url)

    def expected_variant(
        path: str,
        mimetype: str,
        is_default: str,
        file_size: str,
        bitrate: int,
        sampling_rate: int,
        channels: int,
    ) -> dict[str, Any]:
        return {
            "url": f"https://mirror.example/feeds/nasa/audio/{path}",
            "path": path,
            "type": mimetype,
            "medium": "audio",
            "isDefault": is_default,
            "fileSize": file_size,
            "bitrate": bitrate,
            "duration": "61.2",
            "samplingrate": sampling_rate,
            "channels": channels,
        }

    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    assert result == {
        "url": source_url,
        "path": audio_default_path,
        "published_url": (
            f"https://mirror.example/feeds/nasa/audio/{audio_default_path}"
        ),
        # The checksum value itself is not pinned; only its type is checked.
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            expected_variant(
                audio_default_path, "audio/mpeg", "true", "4567", 37209, 48000, 1
            ),
            expected_variant(
                audio_m4a_path, "audio/mp4", "false", "3456", 20746, 48000, 1
            ),
            expected_variant(
                audio_webm_path, "audio/webm", "false", "2345", 48000, 48000, 1
            ),
            expected_variant(
                audio_base_path, "audio/mpeg", "false", "5678", 128000, 44100, 2
            ),
        ],
    }
    assert persisted == [
        (audio_base_path, "audio/mpeg"),
        (audio_default_path, "audio/mpeg"),
        (audio_m4a_path, "audio/mp4"),
        (audio_webm_path, "audio/webm"),
    ]

    completed_item = pipeline.item_completed(
        [(True, result)],
        item,
        spider_info(),
    )
    assert completed_item.audios == [result]
|
|
|
|
|
|
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """``VideoPipeline.media_downloaded`` reports the first transcode as the
    canonical file, lists every variant (original last), and persists the
    original first followed by each transcode.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = VideoPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/video.mp4"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[source_url],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Emit a tiny file named after the profile instead of running ffmpeg.
        produced = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        produced.write_bytes(settings["name"].encode("utf-8"))
        return str(produced)

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    video_profiles = repub_settings.REPUBLISHER_VIDEO
    video_main_path = published_media_path(
        FileType.VIDEO, source_url, video_profiles[0]
    )
    video_fallback_path = published_media_path(
        FileType.VIDEO, source_url, video_profiles[1]
    )

    def probe_result(
        size: str,
        bit_rate: str,
        format_name: str,
        long_name: str,
        video_codec: str,
        width: int,
        height: int,
        frame_rate: str,
        audio_codec: str,
        timed: bool = True,
    ) -> dict[str, Any]:
        # Build a fresh ffprobe-shaped dict; untimed streams omit
        # ``bit_rate`` and ``duration_ts``.
        video_stream: dict[str, Any] = {
            "codec_type": "video",
            "codec_name": video_codec,
        }
        audio_stream: dict[str, Any] = {
            "codec_type": "audio",
            "codec_name": audio_codec,
        }
        if timed:
            video_stream["bit_rate"] = bit_rate
            video_stream["duration_ts"] = "60000"
            audio_stream["bit_rate"] = "96000"
            audio_stream["duration_ts"] = "60000"
        video_stream["width"] = width
        video_stream["height"] = height
        video_stream["avg_frame_rate"] = frame_rate
        return {
            "format": {
                "duration": "60.0",
                "size": size,
                "bit_rate": bit_rate,
                "format_name": format_name,
                "format_long_name": long_name,
            },
            "streams": [video_stream, audio_stream],
        }

    def fake_probe_media(file_path: str):
        # Match either the published path or the temporary transcode name.
        file_name = Path(file_path).name
        if file_path.endswith(video_main_path) or file_name == "main.mp4":
            return probe_result(
                "9876", "123456", "mp4", "MP4", "h264", 1280, 720, "30/1", "aac"
            )
        if file_path.endswith(video_fallback_path) or file_name == "fallback.webm":
            return probe_result(
                "6789", "64000", "matroska,webm", "WebM", "vp9", 1280, 720,
                "25/1", "opus", timed=False,
            )
        # Anything else is treated as the untouched original download.
        return probe_result(
            "12345", "456789", "mp4", "MP4", "h264", 640, 360, "24/1", "mp3"
        )

    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        del info, meta
        assert headers is not None
        destination = store_dir(pipeline) / path
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"video-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )

    video_base_path = local_video_path(source_url)

    def expected_variant(
        path: str,
        mimetype: str,
        is_default: str,
        file_size: str,
        bitrate: int,
        width: int,
        height: int,
        framerate: str,
    ) -> dict[str, Any]:
        return {
            "url": f"https://mirror.example/feeds/nasa/video/{path}",
            "path": path,
            "type": mimetype,
            "medium": "video",
            "isDefault": is_default,
            "fileSize": file_size,
            "bitrate": bitrate,
            "duration": "60.0",
            "width": width,
            "height": height,
            "framerate": framerate,
        }

    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    assert result == {
        "url": source_url,
        "path": video_main_path,
        "published_url": (f"https://mirror.example/feeds/nasa/video/{video_main_path}"),
        # The checksum value itself is not pinned; only its type is checked.
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            expected_variant(
                video_main_path, "video/mp4", "true", "9876", 123456, 1280, 720,
                "30/1",
            ),
            expected_variant(
                video_fallback_path, "video/webm", "false", "6789", 64000, 1280,
                720, "25/1",
            ),
            expected_variant(
                video_base_path, "video/mp4", "false", "12345", 456789, 640, 360,
                "24/1",
            ),
        ],
    }
    assert persisted == [
        (video_base_path, "video/mp4"),
        (video_main_path, "video/mp4"),
        (video_fallback_path, "video/webm"),
    ]
|
|
|
|
|
|
def test_audio_pipeline_media_to_download_checks_canonical_path(
    monkeypatch, tmp_path: Path
) -> None:
    """With the original and all transcodes already in the store,
    ``media_to_download`` reports ``uptodate`` and stats the canonical
    (default-profile) path first.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    source_url = "https://example.com/podcast.mp3"
    audio_base_path = local_audio_path(source_url)
    original_path = store_dir(pipeline) / audio_base_path
    audio_profiles = repub_settings.REPUBLISHER_AUDIO
    audio_default_path = published_media_path(
        FileType.AUDIO, source_url, audio_profiles[0]
    )
    audio_m4a_path = published_media_path(
        FileType.AUDIO, source_url, audio_profiles[1]
    )
    audio_webm_path = published_media_path(
        FileType.AUDIO, source_url, audio_profiles[2]
    )
    canonical_path = store_dir(pipeline) / audio_default_path
    m4a_path = store_dir(pipeline) / audio_m4a_path
    webm_path = store_dir(pipeline) / audio_webm_path

    # Pre-populate the store with the original and every variant.
    original_path.parent.mkdir(parents=True, exist_ok=True)
    original_path.write_bytes(b"original")
    canonical_path.parent.mkdir(parents=True, exist_ok=True)
    canonical_path.write_bytes(b"default")
    m4a_path.write_bytes(b"alt-aac")
    webm_path.write_bytes(b"alt-webm")

    stat_paths: list[str] = []
    original_stat_file = pipeline.store.stat_file
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def wrapped_stat_file(path, info):
        # Record every stat request, then defer to the real store.
        stat_paths.append(path)
        return original_stat_file(path, info)

    def fake_probe_media(file_path):
        # Variant-specific values; anything unmatched is the original.
        is_webm = file_path.endswith(audio_webm_path)
        if file_path.endswith(audio_default_path):
            size, format_rate, format_name = "4567", "37209", "mp3"
            codec_name, stream_rate = "mp3", "37209"
        elif file_path.endswith(audio_m4a_path):
            size, format_rate, format_name = (
                "3456",
                "20746",
                "mov,mp4,m4a,3gp,3g2,mj2",
            )
            codec_name, stream_rate = "aac", "20746"
        elif is_webm:
            size, format_rate, format_name = "2345", "48000", "matroska,webm"
            codec_name, stream_rate = "opus", None
        else:
            size, format_rate, format_name = "5678", "128000", "mp3"
            codec_name, stream_rate = "mp3", "128000"

        is_original = file_path == str(original_path)
        return {
            "format": {
                "duration": "61.2",
                "size": size,
                "bit_rate": format_rate,
                "format_name": format_name,
                "format_long_name": "Audio",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": codec_name,
                    "bit_rate": stream_rate,
                    # The WebM stream reports no timestamp-based duration.
                    "duration_ts": None if is_webm else "61200",
                    "sample_rate": "44100" if is_original else "48000",
                    "channels": 2 if is_original else 1,
                }
            ],
        }

    monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    result = pipeline.media_to_download(
        Request(source_url),
        spider_info(),
        item=item,
    )

    assert result is not None
    assert result["path"] == audio_default_path
    assert result["status"] == "uptodate"
    variant_paths = [variant.get("path") for variant in result["variants"]]
    assert variant_paths == [
        audio_default_path,
        audio_m4a_path,
        audio_webm_path,
        audio_base_path,
    ]
    assert f"{audio_base_path}.mp3" not in stat_paths
    assert stat_paths[0] == audio_default_path
|