# republisher/tests/test_pipelines.py

import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast

import pytest
import pyvips
from scrapy.crawler import Crawler
from scrapy.http import Request, Response

from repub import media
from repub import settings as repub_settings
from repub.config import (
    FeedConfig,
    RepublisherConfig,
    build_base_settings,
    build_feed_settings,
)
from repub.items import ElementItem
from repub.pipelines import (
    AudioPipeline,
    FilePipeline,
    ImageNormalizePipeline,
    ImageThumbnailPipeline,
    VideoPipeline,
    image_mimetype,
)
from repub.utils import (
    FileType,
    canonical_published_image_path,
    local_audio_path,
    local_video_path,
    published_image_path,
    published_media_path,
    source_image_path,
    thumbnail_image_path,
)


def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
    """Build a lightweight crawler stand-in for pipeline tests.

    The returned namespace carries only ``settings`` and
    ``request_fingerprinter``. The settings are built for a single feed with
    slug ``"nasa"`` whose output directory is ``tmp_path / "mirror"``, and
    ``REPUBLISHER_FEED_URL`` is forced to ``https://mirror.example``.
    """
    mirror_root = (tmp_path / "mirror").resolve()
    nasa_feed = FeedConfig(
        name="NASA Breaking News",
        slug="nasa",
        url="https://www.nasa.gov/rss/dyn/breaking_news.rss",
    )
    republisher_config = RepublisherConfig(
        config_path=tmp_path / "repub.toml",
        out_dir=mirror_root,
        feeds=(nasa_feed,),
        scrapy_settings={},
    )
    feed_settings = build_feed_settings(
        build_base_settings(republisher_config),
        out_dir=mirror_root,
        feed_slug="nasa",
    )
    # Set at "cmdline" priority so the test URL is the one the pipelines see.
    feed_settings.set(
        "REPUBLISHER_FEED_URL", "https://mirror.example", priority="cmdline"
    )
    return SimpleNamespace(settings=feed_settings, request_fingerprinter=object())


class HashableSpiderInfo:
    """Hashable stand-in for the spider-info object handed to pipelines.

    Instances hash by identity (via ``object.__hash__``) and expose an empty
    ``spider`` namespace.
    """

    def __init__(self) -> None:
        self.spider = SimpleNamespace()

    def __hash__(self) -> int:
        # Explicit identity-based hashing, same as the default object hash.
        return object.__hash__(self)


def spider_info() -> Any:
    """Return a new ``HashableSpiderInfo``, typed as ``Any`` for call sites."""
    info = HashableSpiderInfo()
    return info


def store_dir(pipeline: Any) -> Path:
    """Return the ``basedir`` of *pipeline*'s store as a ``Path``."""
    store = cast(Any, pipeline.store)
    return Path(store.basedir)


def transparent_png_bytes() -> bytes:
    """Return PNG-encoded bytes of an all-zero 2x3 image with four bands."""
    blank = cast(Any, pyvips.Image.black(2, 3, bands=4))
    return blank.pngsave_buffer()


def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes:
    """Return PNG-encoded bytes of an all-zero image of the given size.

    ``bands`` is keyword-only and defaults to four.
    """
    blank = cast(Any, pyvips.Image.black(width, height, bands=bands))
    return blank.pngsave_buffer()


@pytest.mark.parametrize(
    "pipeline_cls,store_setting",
    [
        (ImageNormalizePipeline, "IMAGES_STORE"),
        (AudioPipeline, "AUDIO_STORE"),
        (VideoPipeline, "VIDEO_STORE"),
        (FilePipeline, "FILES_STORE"),
    ],
)
def test_pipeline_from_crawler_uses_configured_store(
    tmp_path: Path, pipeline_cls, store_setting: str
) -> None:
    """Each pipeline built via ``from_crawler`` keeps the crawler settings
    and stores files under the directory named by its store setting."""
    fake_crawler = build_test_crawler(tmp_path)

    built = pipeline_cls.from_crawler(cast(Crawler, fake_crawler))

    assert built.settings is fake_crawler.settings
    configured_store = Path(fake_crawler.settings[store_setting])
    assert store_dir(built) == configured_store


def test_transcode_audio_captures_ffmpeg_output(monkeypatch, tmp_path: Path) -> None:
    """``transcode_audio`` runs ffmpeg with stdout and stderr captured."""
    source = tmp_path / "input.mp3"
    source.write_bytes(b"12345")
    target_dir = tmp_path / "audio-out"
    target_dir.mkdir()
    recorded_runs: list[dict[str, object]] = []

    class StubOutput:
        """Stands in for the ffmpeg output node; records ``run`` kwargs."""

        def __init__(self, output_path: Path):
            self.output_path = output_path

        def run(self, **kwargs):
            recorded_runs.append(kwargs)
            # Write a small file to the requested output path.
            self.output_path.write_bytes(b"12")
            return (b"", b"")

    class StubInput:
        """Stands in for the ffmpeg input node."""

        def output(self, output_file: str, **params):
            del params
            return StubOutput(Path(output_file))

    def fake_input(_):
        return StubInput()

    monkeypatch.setattr(media.ffmpeg, "input", fake_input)

    converted = media.transcode_audio(
        str(source),
        str(target_dir),
        {"extension": "mp3", "acodec": "libmp3lame"},
    )

    assert converted == str(target_dir / "converted.mp3")
    assert recorded_runs == [{"capture_stdout": True, "capture_stderr": True}]


def test_transcode_video_two_pass_does_not_print_ffmpeg_output(
    monkeypatch, tmp_path: Path
) -> None:
    """A successful two-pass video transcode captures ffmpeg output for both
    passes and never calls ``print``."""
    source = tmp_path / "input.mp4"
    source.write_bytes(b"12345")
    target_dir = tmp_path / "video-out"
    target_dir.mkdir()
    recorded_runs: list[dict[str, object]] = []
    print_calls: list[tuple[tuple[object, ...], dict[str, object]]] = []

    class StubOutput:
        """Stands in for the ffmpeg output node; records ``run`` kwargs."""

        def __init__(self, output_path: Path | None):
            self.output_path = output_path

        def global_args(self, *args):
            del args
            return self

        def run(self, **kwargs):
            recorded_runs.append(kwargs)
            # Only outputs given an .mp4 path produce a file.
            if self.output_path is not None:
                self.output_path.write_bytes(b"12")
            return (b"pass-out", b"pass-err")

    class StubInput:
        """Stands in for the ffmpeg input node with video/audio streams."""

        video = object()
        audio = object()

        def output(self, *args, **params):
            del params
            # Use the first positional string ending in ".mp4" as the output path.
            for candidate in args:
                if isinstance(candidate, str) and candidate.endswith(".mp4"):
                    return StubOutput(Path(candidate))
            return StubOutput(None)

    def fake_input(_):
        return StubInput()

    def record_print(*args, **kwargs):
        print_calls.append((args, kwargs))

    monkeypatch.setattr(media.ffmpeg, "input", fake_input)
    monkeypatch.setattr("builtins.print", record_print)

    converted = media.transcode_video(
        str(source),
        str(target_dir),
        {
            "extension": "mp4",
            "passes": [
                {"f": "null"},
                {"c:v": "libx264"},
            ],
        },
    )

    expected_runs = [
        {"capture_stdout": True, "capture_stderr": True},
        {
            "capture_stdout": True,
            "capture_stderr": True,
            "overwrite_output": True,
        },
    ]
    assert converted == str(target_dir / "converted.mp4")
    assert recorded_runs == expected_runs
    assert print_calls == []


def test_transcode_video_prints_ffmpeg_output_on_error(
    monkeypatch, tmp_path: Path
) -> None:
    """When ffmpeg fails, ``transcode_video`` raises ``RuntimeError`` and
    prints the captured stderr to ``sys.stderr`` and stdout elsewhere."""
    source = tmp_path / "input.mp4"
    source.write_bytes(b"12345")
    target_dir = tmp_path / "video-out"
    target_dir.mkdir()
    # Each entry is (first printed argument as str, whether file is sys.stderr).
    print_calls: list[tuple[str, bool]] = []

    class FailingOutput:
        """Output node whose ``run`` always raises an ffmpeg error."""

        def run(self, **kwargs):
            del kwargs
            raise media.ffmpeg.Error("ffmpeg", b"video-stdout", b"video-stderr")

    class StubInput:
        """Stands in for the ffmpeg input node."""

        def output(self, *args, **params):
            del args, params
            return FailingOutput()

    def capture_print(*args, **kwargs):
        text = str(args[0])
        to_stderr = kwargs.get("file") is sys.stderr
        print_calls.append((text, to_stderr))

    def fake_input(_):
        return StubInput()

    monkeypatch.setattr(media.ffmpeg, "input", fake_input)
    monkeypatch.setattr("builtins.print", capture_print)

    with pytest.raises(RuntimeError):
        media.transcode_video(
            str(source),
            str(target_dir),
            {"extension": "mp4", "c:v": "libx264"},
        )

    assert ("video-stderr", True) in print_calls
    assert ("video-stdout", False) in print_calls


def test_video_transcode_params_scales_to_max_height() -> None:
    """A 1920x1080 probe against a profile with ``max_height`` 720 yields a
    ``scale=-2:720`` filter merged with the profile's ffmpeg params."""
    expected = {
        "extension": "mp4",
        "vf": "scale=-2:720",
        "vcodec": "h264",
        "strict": "-2",
        "acodec": "libmp3lame",
    }

    computed = media.video_transcode_params(
        {
            "format": {"format_name": "mp4"},
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "mpeg4",
                    "bit_rate": "2000000",
                    "duration_ts": "1",
                    "width": 1920,
                    "height": 1080,
                },
                {
                    "codec_type": "audio",
                    "codec_name": "aac",
                    "bit_rate": "128000",
                    "duration_ts": "1",
                },
            ],
        },
        {
            "name": "720",
            "container": "mp4",
            "vcodec": "h264",
            "acodec": "mp3",
            "audio_max_bitrate": 96000,
            "ffmpeg_audio_params": {"acodec": "libmp3lame"},
            "ffmpeg_video_params": {"vcodec": "h264", "strict": "-2"},
            "max_height": 720,
            "mimetype": "video/mp4",
            "extension": "mp4",
        },
    )

    assert computed == expected


def test_video_transcode_params_scales_to_max_height_for_multipass() -> None:
    """With a multi-pass profile, the ``scale=-2:720`` filter is added to
    every pass and the result carries only ``extension`` and ``passes``."""
    expected = {
        "extension": "webm",
        "passes": [
            {
                "c:v": "libvpx-vp9",
                "pass": "1",
                "f": "null",
                "vf": "scale=-2:720",
            },
            {
                "c:v": "libvpx-vp9",
                "pass": "2",
                "c:a": "libopus",
                "vf": "scale=-2:720",
            },
        ],
    }

    computed = media.video_transcode_params(
        {
            "format": {"format_name": "mp4"},
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "mpeg4",
                    "bit_rate": "2000000",
                    "duration_ts": "1",
                    "width": 1920,
                    "height": 1080,
                },
                {
                    "codec_type": "audio",
                    "codec_name": "mp3",
                    "bit_rate": "128000",
                    "duration_ts": "1",
                },
            ],
        },
        cast(
            media.VideoSettings,
            {
                "name": "720",
                "container": "webm",
                "vcodec": "libvpx-vp9",
                "acodec": "opus",
                "audio_max_bitrate": 96000,
                "ffmpeg_audio_params": {"c:a": "libopus", "b:a": "96k"},
                "ffmpeg_video_params": {},
                "max_height": 720,
                "mimetype": "video/webm",
                "extension": "webm",
                "passes": [
                    {"c:v": "libvpx-vp9", "pass": "1", "f": "null"},
                    {"c:v": "libvpx-vp9", "pass": "2", "c:a": "libopus"},
                ],
            },
        ),
    )

    assert computed == expected


def test_audio_transcode_params_accepts_m4a_format_family() -> None:
    """A probe whose ``format_name`` is the comma-separated mov/mp4/m4a
    family, checked against an ``m4a`` profile, yields no transcode params."""
    computed = media.audio_transcode_params(
        {
            "format": {
                "bit_rate": "20000",
                "format_name": "mov,mp4,m4a,3gp,3g2,mj2",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": "aac",
                    "bit_rate": "20000",
                    "duration_ts": "1",
                }
            ],
        },
        cast(
            media.AudioSettings,
            {
                "name": "m4a",
                "format": "m4a",
                "max_bitrate": 64000,
                "mimetype": "audio/mp4",
                "extension": "m4a",
                "ffmpeg_audio_params": {
                    "acodec": "libfdk_aac",
                    "vbr": "2",
                },
            },
        ),
    )

    # None is the expected "nothing to transcode" result here.
    assert computed is None


def test_audio_meta_handles_webm_without_duration_ts() -> None:
    """``audio_meta`` works when the audio stream has no ``duration_ts`` or
    ``bit_rate``; the expected values match the ``format`` section."""
    expected = {
        "duration": "1.0",
        "fileSize": "100",
        "bitrate": 48000,
        "samplingrate": 48000,
        "channels": 1,
    }

    computed = media.audio_meta(
        {
            "format": {
                "duration": "1.0",
                "size": "100",
                "bit_rate": "48000",
                "format_name": "matroska,webm",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": "opus",
                    "sample_rate": "48000",
                    "channels": 1,
                }
            ],
        }
    )

    assert computed == expected


def test_video_meta_handles_webm_without_duration_ts() -> None:
    """``video_meta`` works when neither stream has ``duration_ts`` or
    ``bit_rate``; duration, size and bitrate match the ``format`` section."""
    expected = {
        "duration": "1.0",
        "fileSize": "200",
        "width": 640,
        "height": 360,
        "bitrate": 64000,
        "framerate": "25/1",
    }

    computed = media.video_meta(
        {
            "format": {
                "duration": "1.0",
                "size": "200",
                "bit_rate": "64000",
                "format_name": "matroska,webm",
            },
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "vp9",
                    "width": 640,
                    "height": 360,
                    "avg_frame_rate": "25/1",
                },
                {
                    "codec_type": "audio",
                    "codec_name": "opus",
                    "sample_rate": "48000",
                    "channels": 1,
                },
            ],
        }
    )

    assert computed == expected


def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """Check the file info ``AudioPipeline.media_downloaded`` returns.

    Transcoding, media probing and store persistence are all replaced with
    fakes. The test asserts that the returned dict points at the first
    ``REPUBLISHER_AUDIO`` variant as the canonical path, lists three
    transcoded variants followed by the base file, that files are persisted
    base-first, and that ``item_completed`` stores the result on
    ``item.audios``.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    # Replace stats bookkeeping with a no-op.
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    # (path, Content-Type) pairs in the order persist_file is called.
    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/podcast.mp3"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Write "<name>.<extension>" into tmp_dir with the settings name as
        # its content, and return that path instead of running ffmpeg.
        output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        output_path.write_bytes(settings["name"].encode("utf-8"))
        return str(output_path)

    # Expected published paths for the three configured audio variants.
    audio_default_path = published_media_path(
        FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[0]
    )
    audio_m4a_path = published_media_path(
        FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[1]
    )
    audio_webm_path = published_media_path(
        FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[2]
    )

    def fake_probe_media(file_path: str):
        # Return canned probe data chosen by published path suffix or by
        # file name; the file names presumably match what fake_transcode
        # writes for each configured variant -- verify against settings.
        file_name = Path(file_path).name
        if file_path.endswith(audio_default_path) or file_name == "mp3_vbr7_voice.mp3":
            return {
                "format": {
                    "duration": "61.2",
                    "size": "4567",
                    "bit_rate": "37209",
                    "format_name": "mp3",
                    "format_long_name": "MP3",
                },
                "streams": [
                    {
                        "codec_type": "audio",
                        "codec_name": "mp3",
                        "bit_rate": "37209",
                        "duration_ts": "61200",
                        "sample_rate": "48000",
                        "channels": 1,
                    }
                ],
            }
        if file_path.endswith(audio_m4a_path) or file_name == "m4a_aac_vbr2_voice.m4a":
            return {
                "format": {
                    "duration": "61.2",
                    "size": "3456",
                    "bit_rate": "20746",
                    "format_name": "mov,mp4,m4a,3gp,3g2,mj2",
                    "format_long_name": "AAC",
                },
                "streams": [
                    {
                        "codec_type": "audio",
                        "codec_name": "aac",
                        "bit_rate": "20746",
                        "duration_ts": "61200",
                        "sample_rate": "48000",
                        "channels": 1,
                    }
                ],
            }
        if (
            file_path.endswith(audio_webm_path)
            or file_name == "webm_opus_voice_48k.webm"
        ):
            # The WebM stream carries no bit_rate or duration_ts.
            return {
                "format": {
                    "duration": "61.2",
                    "size": "2345",
                    "bit_rate": "48000",
                    "format_name": "matroska,webm",
                    "format_long_name": "WebM",
                },
                "streams": [
                    {
                        "codec_type": "audio",
                        "codec_name": "opus",
                        "sample_rate": "48000",
                        "channels": 1,
                    }
                ],
            }
        # Any other path gets this probe; its values match the base-file
        # variant asserted at the end of the variants list below.
        return {
            "format": {
                "duration": "61.2",
                "size": "5678",
                "bit_rate": "128000",
                "format_name": "mp3",
                "format_long_name": "MP3",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": "mp3",
                    "bit_rate": "128000",
                    "duration_ts": "61200",
                    "sample_rate": "44100",
                    "channels": 2,
                }
            ],
        }

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        # Write the buffer under the pipeline's store directory and record
        # the path together with the Content-Type header it was given.
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"source-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )

    audio_base_path = local_audio_path(source_url)
    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    # The checksum value itself is not pinned; it is compared to itself.
    assert result == {
        "url": source_url,
        "path": audio_default_path,
        "published_url": (
            f"https://mirror.example/feeds/nasa/audio/{audio_default_path}"
        ),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            {
                "url": f"https://mirror.example/feeds/nasa/audio/{audio_default_path}",
                "path": audio_default_path,
                "type": "audio/mpeg",
                "medium": "audio",
                "isDefault": "true",
                "fileSize": "4567",
                "bitrate": 37209,
                "duration": "61.2",
                "samplingrate": 48000,
                "channels": 1,
            },
            {
                "url": f"https://mirror.example/feeds/nasa/audio/{audio_m4a_path}",
                "path": audio_m4a_path,
                "type": "audio/mp4",
                "medium": "audio",
                "isDefault": "false",
                "fileSize": "3456",
                "bitrate": 20746,
                "duration": "61.2",
                "samplingrate": 48000,
                "channels": 1,
            },
            {
                "url": f"https://mirror.example/feeds/nasa/audio/{audio_webm_path}",
                "path": audio_webm_path,
                "type": "audio/webm",
                "medium": "audio",
                "isDefault": "false",
                "fileSize": "2345",
                "bitrate": 48000,
                "duration": "61.2",
                "samplingrate": 48000,
                "channels": 1,
            },
            {
                "url": f"https://mirror.example/feeds/nasa/audio/{audio_base_path}",
                "path": audio_base_path,
                "type": "audio/mpeg",
                "medium": "audio",
                "isDefault": "false",
                "fileSize": "5678",
                "bitrate": 128000,
                "duration": "61.2",
                "samplingrate": 44100,
                "channels": 2,
            },
        ],
    }
    # The base file is persisted first, then the variants in settings order.
    assert persisted == [
        (audio_base_path, "audio/mpeg"),
        (audio_default_path, "audio/mpeg"),
        (audio_m4a_path, "audio/mp4"),
        (audio_webm_path, "audio/webm"),
    ]

    completed_item = pipeline.item_completed(
        [(True, result)],
        item,
        spider_info(),
    )
    assert completed_item.audios == [result]


def test_image_mimetype_does_not_guess_from_url_extension() -> None:
    """Given only a URL ending in ``.jpg``, ``image_mimetype`` returns None."""
    guessed = image_mimetype(url="https://example.com/photo.jpg")

    assert guessed is None


def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """Downloading a 2x3 PNG persists the untouched source plus WebP and JPEG
    variants, and returns file info whose canonical path is the published
    image path; ``item_completed`` then stores that info on ``item.images``."""
    source_url = "https://example.com/photo.png"
    png_payload = transparent_png_bytes()
    crawler = build_test_crawler(tmp_path)
    normalizer = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(normalizer, "inc_stats", lambda status: None)
    image_settings = crawler.settings["REPUBLISHER_IMAGE"]
    feed_item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )
    expected_canonical = canonical_published_image_path(source_url, image_settings)
    expected_source = source_image_path(source_url, "image/png")
    expected_webp = published_image_path(source_url, image_settings[0])
    expected_jpeg = published_image_path(source_url, image_settings[1])

    png_response = Response(
        url=source_url,
        body=png_payload,
        status=200,
        headers={"Content-Type": "image/png"},
    )
    result = normalizer.media_downloaded(
        png_response,
        Request(source_url),
        spider_info(),
        item=feed_item,
    )
    # File sizes depend on the encoder, so only their type is checked below.
    webp_size = result["variants"][0].get("fileSize")
    jpeg_size = result["variants"][1].get("fileSize")

    expected_result = {
        "url": source_url,
        "path": expected_canonical,
        "published_url": f"https://mirror.example/feeds/nasa/images/{expected_canonical}",
        "checksum": result["checksum"],
        "status": "downloaded",
        "source_path": expected_source,
        "variants": [
            {
                "url": f"https://mirror.example/feeds/nasa/images/{expected_webp}",
                "path": expected_webp,
                "type": "image/webp",
                "medium": "image",
                "isDefault": "true",
                "fileSize": webp_size,
                "width": 2,
                "height": 3,
            },
            {
                "url": f"https://mirror.example/feeds/nasa/images/{expected_jpeg}",
                "path": expected_jpeg,
                "type": "image/jpeg",
                "medium": "image",
                "isDefault": "false",
                "fileSize": jpeg_size,
                "width": 2,
                "height": 3,
            },
        ],
        "thumbnails": [],
    }
    assert result == expected_result
    assert isinstance(result["checksum"], str)
    assert isinstance(webp_size, int)
    assert isinstance(jpeg_size, int)

    images_root = store_dir(normalizer)
    assert (images_root / expected_source).read_bytes() == png_payload
    webp_image = cast(
        Any, pyvips.Image.new_from_file(str(images_root / expected_webp))
    )
    jpeg_image = cast(
        Any, pyvips.Image.new_from_file(str(images_root / expected_jpeg))
    )
    for decoded in (webp_image, jpeg_image):
        assert (decoded.width, decoded.height) == (2, 3)
    # The four-band source is written out as a three-band JPEG.
    assert jpeg_image.bands == 3

    completed = normalizer.item_completed([(True, result)], feed_item, spider_info())
    assert completed.images == [result]


def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image(
    monkeypatch, tmp_path: Path
) -> None:
    """After normalizing a 1200x900 PNG, the thumbnail pipeline adds the
    ``card_hero`` (640x360) and ``list_square`` (160x160) thumbnails, and the
    files on disk have the reported dimensions."""
    source_url = "https://example.com/photo.png"
    png_payload = png_bytes(1200, 900)
    crawler = build_test_crawler(tmp_path)
    normalizer = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    thumbnailer = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(normalizer, "inc_stats", lambda status: None)
    feed_item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )

    png_response = Response(
        url=source_url,
        body=png_payload,
        status=200,
        headers={"Content-Type": "image/png"},
    )
    normalized = normalizer.media_downloaded(
        png_response,
        Request(source_url),
        spider_info(),
        item=feed_item,
    )
    # Feed the normalize result to the thumbnail pipeline via the item.
    feed_item.images = [normalized]

    processed = thumbnailer.process_item(feed_item, spider_info().spider)
    thumbnails = processed.images[0]["thumbnails"]
    slots = [entry.get("slot") for entry in thumbnails]
    hero_thumb = thumbnails[0]
    square_thumb = thumbnails[1]
    thumbnail_settings = crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"]

    assert processed.images[0]["path"] == canonical_published_image_path(
        source_url,
        crawler.settings["REPUBLISHER_IMAGE"],
    )
    assert slots == ["card_hero", "list_square"]
    assert hero_thumb.get("path") == thumbnail_image_path(
        source_url, thumbnail_settings[0]
    )
    assert hero_thumb.get("type") == "image/jpeg"
    assert hero_thumb.get("width") == 640
    assert hero_thumb.get("height") == 360
    assert square_thumb.get("path") == thumbnail_image_path(
        source_url, thumbnail_settings[1]
    )
    assert square_thumb.get("width") == 160
    assert square_thumb.get("height") == 160

    # Thumbnails are read back from the normalize pipeline's store directory.
    images_root = store_dir(normalizer)
    for entry in thumbnails:
        reported_size = (entry.get("width"), entry.get("height"))
        on_disk = cast(
            Any,
            pyvips.Image.new_from_file(str(images_root / str(entry.get("path")))),
        )
        assert (on_disk.width, on_disk.height) == reported_size


def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls(
    monkeypatch, tmp_path: Path
) -> None:
    """For a URL with no file extension, the ``source_path`` reported on a
    later ``media_to_download`` hit equals the ``.png`` path reported when
    the image was first downloaded."""
    source_url = "https://example.com/photo"
    crawler = build_test_crawler(tmp_path)
    normalizer = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(normalizer, "inc_stats", lambda status: None)
    feed_item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )

    # First pass: download and persist; the extension comes from Content-Type.
    png_response = Response(
        url=source_url,
        body=transparent_png_bytes(),
        status=200,
        headers={"Content-Type": "image/png"},
    )
    first_pass = normalizer.media_downloaded(
        png_response,
        Request(source_url),
        spider_info(),
        item=feed_item,
    )

    # Second pass: ask the pipeline whether the media still needs downloading.
    cached = normalizer.media_to_download(
        Request(source_url), spider_info(), item=feed_item
    )

    assert first_pass["source_path"].endswith(".png")
    assert cached is not None
    assert cached["source_path"] == first_pass["source_path"]


def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
    monkeypatch, tmp_path: Path
) -> None:
    """Check the file info ``VideoPipeline.media_downloaded`` returns.

    Transcoding, media probing and store persistence are all replaced with
    fakes. The test asserts that the returned dict points at the first
    ``REPUBLISHER_VIDEO`` variant as the canonical path, lists two transcoded
    variants followed by the base file, and that files are persisted
    base-first.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = VideoPipeline.from_crawler(cast(Crawler, crawler))
    # Replace stats bookkeeping with a no-op.
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    # (path, Content-Type) pairs in the order persist_file is called.
    persisted: list[tuple[str, str]] = []
    source_url = "https://example.com/video.mp4"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[source_url],
        videos=[],
    )

    def fake_transcode(
        input_file: str, settings: media.MediaSettings, tmp_dir: str
    ) -> str:
        # Write "<name>.<extension>" into tmp_dir with the settings name as
        # its content, and return that path instead of running ffmpeg.
        output_path = Path(tmp_dir) / f"{settings['name']}.{settings['extension']}"
        output_path.write_bytes(settings["name"].encode("utf-8"))
        return str(output_path)

    monkeypatch.setattr(pipeline, "transcode", fake_transcode)
    # Expected published paths for the two configured video variants.
    video_main_path = published_media_path(
        FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[0]
    )
    video_fallback_path = published_media_path(
        FileType.VIDEO, source_url, repub_settings.REPUBLISHER_VIDEO[1]
    )

    def fake_probe_media(file_path: str):
        # Return canned probe data chosen by published path suffix or by
        # file name; the file names presumably match what fake_transcode
        # writes for each configured variant -- verify against settings.
        file_name = Path(file_path).name
        if file_path.endswith(video_main_path) or file_name == "main.mp4":
            return {
                "format": {
                    "duration": "60.0",
                    "size": "9876",
                    "bit_rate": "123456",
                    "format_name": "mp4",
                    "format_long_name": "MP4",
                },
                "streams": [
                    {
                        "codec_type": "video",
                        "codec_name": "h264",
                        "bit_rate": "123456",
                        "duration_ts": "60000",
                        "width": 1280,
                        "height": 720,
                        "avg_frame_rate": "30/1",
                    },
                    {
                        "codec_type": "audio",
                        "codec_name": "aac",
                        "bit_rate": "96000",
                        "duration_ts": "60000",
                    },
                ],
            }
        if file_path.endswith(video_fallback_path) or file_name == "fallback.webm":
            # The WebM streams carry no bit_rate or duration_ts.
            return {
                "format": {
                    "duration": "60.0",
                    "size": "6789",
                    "bit_rate": "64000",
                    "format_name": "matroska,webm",
                    "format_long_name": "WebM",
                },
                "streams": [
                    {
                        "codec_type": "video",
                        "codec_name": "vp9",
                        "width": 1280,
                        "height": 720,
                        "avg_frame_rate": "25/1",
                    },
                    {
                        "codec_type": "audio",
                        "codec_name": "opus",
                    },
                ],
            }
        # Any other path gets this probe; its values match the base-file
        # variant asserted at the end of the variants list below.
        return {
            "format": {
                "duration": "60.0",
                "size": "12345",
                "bit_rate": "456789",
                "format_name": "mp4",
                "format_long_name": "MP4",
            },
            "streams": [
                {
                    "codec_type": "video",
                    "codec_name": "h264",
                    "bit_rate": "456789",
                    "duration_ts": "60000",
                    "width": 640,
                    "height": 360,
                    "avg_frame_rate": "24/1",
                },
                {
                    "codec_type": "audio",
                    "codec_name": "mp3",
                    "bit_rate": "96000",
                    "duration_ts": "60000",
                },
            ],
        }

    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        # Write the buffer under the pipeline's store directory and record
        # the path together with the Content-Type header it was given.
        del info, meta
        assert headers is not None
        target = store_dir(pipeline) / path
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(buf.read())
        persisted.append((path, headers["Content-Type"]))

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    result = pipeline.media_downloaded(
        Response(url=source_url, body=b"video-bytes", status=200),
        Request(source_url),
        spider_info(),
        item=item,
    )

    video_base_path = local_video_path(source_url)
    assert isinstance(result, dict)
    assert isinstance(result["checksum"], str)
    # The checksum value itself is not pinned; it is compared to itself.
    assert result == {
        "url": source_url,
        "path": video_main_path,
        "published_url": (f"https://mirror.example/feeds/nasa/video/{video_main_path}"),
        "checksum": result["checksum"],
        "status": "downloaded",
        "variants": [
            {
                "url": f"https://mirror.example/feeds/nasa/video/{video_main_path}",
                "path": video_main_path,
                "type": "video/mp4",
                "medium": "video",
                "isDefault": "true",
                "fileSize": "9876",
                "bitrate": 123456,
                "duration": "60.0",
                "width": 1280,
                "height": 720,
                "framerate": "30/1",
            },
            {
                "url": f"https://mirror.example/feeds/nasa/video/{video_fallback_path}",
                "path": video_fallback_path,
                "type": "video/webm",
                "medium": "video",
                "isDefault": "false",
                "fileSize": "6789",
                "bitrate": 64000,
                "duration": "60.0",
                "width": 1280,
                "height": 720,
                "framerate": "25/1",
            },
            {
                "url": f"https://mirror.example/feeds/nasa/video/{video_base_path}",
                "path": video_base_path,
                "type": "video/mp4",
                "medium": "video",
                "isDefault": "false",
                "fileSize": "12345",
                "bitrate": 456789,
                "duration": "60.0",
                "width": 640,
                "height": 360,
                "framerate": "24/1",
            },
        ],
    }
    # The base file is persisted first, then the variants in settings order.
    assert persisted == [
        (video_base_path, "video/mp4"),
        (video_main_path, "video/mp4"),
        (video_fallback_path, "video/webm"),
    ]


def test_audio_pipeline_media_to_download_checks_canonical_path(
    monkeypatch, tmp_path: Path
) -> None:
    """Check that an already-mirrored audio file is reported as up to date.

    The original download and its three published variants are written to
    the pipeline store, ``media.probe_media`` is replaced by a stub, and
    ``media_to_download`` is then expected to:

    * return the default published path with status ``"uptodate"``,
    * list the variants as default, m4a, webm, then the original file,
    * stat the default published path first and never stat
      ``f"{audio_base_path}.mp3"``.
    """
    crawler = build_test_crawler(tmp_path)
    pipeline = AudioPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
    source_url = "https://example.com/podcast.mp3"
    store_root = store_dir(pipeline)

    # Store-relative paths: the raw download plus one path per audio preset.
    audio_base_path = local_audio_path(source_url)
    audio_default_path, audio_m4a_path, audio_webm_path = (
        published_media_path(
            FileType.AUDIO, source_url, repub_settings.REPUBLISHER_AUDIO[index]
        )
        for index in range(3)
    )

    # Absolute locations of the same files inside the pipeline store.
    original_path = store_root / audio_base_path
    canonical_path = store_root / audio_default_path
    m4a_path = store_root / audio_m4a_path
    webm_path = store_root / audio_webm_path

    # Seed the store so every file the pipeline may look for already exists.
    original_path.parent.mkdir(parents=True, exist_ok=True)
    original_path.write_bytes(b"original")
    canonical_path.parent.mkdir(parents=True, exist_ok=True)
    canonical_path.write_bytes(b"default")
    m4a_path.write_bytes(b"alt-aac")
    webm_path.write_bytes(b"alt-webm")

    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[source_url],
        audios=[],
        video_urls=[],
        videos=[],
    )

    # Record every path handed to the store's stat_file, in call order.
    stat_paths: list[str] = []
    original_stat_file = pipeline.store.stat_file

    def wrapped_stat_file(path, info):
        stat_paths.append(path)
        return original_stat_file(path, info)

    monkeypatch.setattr(pipeline.store, "stat_file", wrapped_stat_file)

    # Canned probe values per published variant. Entries are matched by path
    # suffix in this order; anything that matches none of them gets the
    # values in ``source_profile``.
    variant_profiles = (
        (
            audio_default_path,
            {
                "size": "4567",
                "format_bit_rate": "37209",
                "format_name": "mp3",
                "codec_name": "mp3",
                "stream_bit_rate": "37209",
            },
        ),
        (
            audio_m4a_path,
            {
                "size": "3456",
                "format_bit_rate": "20746",
                "format_name": "mov,mp4,m4a,3gp,3g2,mj2",
                "codec_name": "aac",
                "stream_bit_rate": "20746",
            },
        ),
        (
            audio_webm_path,
            {
                "size": "2345",
                "format_bit_rate": "48000",
                "format_name": "matroska,webm",
                "codec_name": "opus",
                "stream_bit_rate": None,
            },
        ),
    )
    source_profile = {
        "size": "5678",
        "format_bit_rate": "128000",
        "format_name": "mp3",
        "codec_name": "mp3",
        "stream_bit_rate": "128000",
    }

    def fake_probe_media(file_path):
        # First matching suffix wins, mirroring a default/m4a/webm/else chain.
        profile = source_profile
        for suffix, candidate in variant_profiles:
            if file_path.endswith(suffix):
                profile = candidate
                break
        is_original = file_path == str(original_path)
        return {
            "format": {
                "duration": "61.2",
                "size": profile["size"],
                "bit_rate": profile["format_bit_rate"],
                "format_name": profile["format_name"],
                "format_long_name": "Audio",
            },
            "streams": [
                {
                    "codec_type": "audio",
                    "codec_name": profile["codec_name"],
                    "bit_rate": profile["stream_bit_rate"],
                    # Only the webm variant reports no stream-level duration.
                    "duration_ts": (
                        None if file_path.endswith(audio_webm_path) else "61200"
                    ),
                    "sample_rate": "44100" if is_original else "48000",
                    "channels": 2 if is_original else 1,
                }
            ],
        }

    monkeypatch.setattr(media, "probe_media", fake_probe_media)

    result = pipeline.media_to_download(
        Request(source_url),
        spider_info(),
        item=item,
    )

    assert result is not None
    assert result["path"] == audio_default_path
    assert result["status"] == "uptodate"
    variant_paths = [variant.get("path") for variant in result["variants"]]
    assert variant_paths == [
        audio_default_path,
        audio_m4a_path,
        audio_webm_path,
        audio_base_path,
    ]
    assert f"{audio_base_path}.mp3" not in stat_paths
    assert stat_paths[0] == audio_default_path