Replace image pipeline with profile-driven variants

- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
2026-05-27 09:24:22 +02:00 · 2026-05-27 09:24:22 +02:00 · 525393272e
commit 525393272e
parent 7316d4723f
13 changed files with 1299 additions and 124 deletions
--- a/tests/test_config.py
+++ b/tests/test_config.py
@ -224,7 +224,46 @@ def test_build_feed_settings_can_disable_image_and_video_conversion(
        convert_video=False,
    )

-    assert "repub.pipelines.ImagePipeline" not in feed_settings["ITEM_PIPELINES"]
+    assert (
+        "repub.pipelines.ImageNormalizePipeline" not in feed_settings["ITEM_PIPELINES"]
+    )
+    assert (
+        "repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"]
+    )
    assert "repub.pipelines.VideoPipeline" not in feed_settings["ITEM_PIPELINES"]
-    assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 2
-    assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 4
+    assert feed_settings["REPUBLISHER_IMAGE_NORMALIZE_ENABLED"] is False
+    assert feed_settings["REPUBLISHER_IMAGE_THUMBNAILS_ENABLED"] is False
+    assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 3
+    assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 5
+
+
+def test_build_feed_settings_respects_image_pipeline_feature_flags(
+    tmp_path: Path,
+) -> None:
+    out_dir = (tmp_path / "mirror").resolve()
+    config = RepublisherConfig(
+        config_path=tmp_path / "repub.toml",
+        out_dir=out_dir,
+        feeds=(
+            FeedConfig(
+                name="Guardian Project Podcast",
+                slug="gp-pod",
+                url="https://guardianproject.info/podcast/podcast.xml",
+            ),
+        ),
+        scrapy_settings={"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": False},
+    )
+
+    base_settings = build_base_settings(config)
+    feed_settings = build_feed_settings(
+        base_settings,
+        out_dir=out_dir,
+        feed_slug="gp-pod",
+    )
+
+    assert (
+        feed_settings["ITEM_PIPELINES"]["repub.pipelines.ImageNormalizePipeline"] == 1
+    )
+    assert (
+        "repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"]
+    )
--- a/tests/test_feed_validation.py
+++ b/tests/test_feed_validation.py
@ -16,10 +16,12 @@ from repub.rss import nsmap
 from repub.spiders.rss_spider import RssFeedSpider
 from repub.utils import (
    FileType,
+    canonical_published_image_path,
    local_audio_path,
-    local_image_path,
    local_video_path,
+    published_image_path,
    published_media_path,
+    thumbnail_image_path,
 )

 RSS_DATE_PATTERN = re.compile(
@ -44,6 +46,7 @@ def _serialize_feed(
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
+            "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
            "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
            "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
            "REPUBLISHER_FEED_URL": feed_url,
@ -75,6 +78,18 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    source_video = "https://source.example/media/video.mp4"
    channel_image = "https://source.example/media/channel.png"
    item_image = "https://source.example/media/cover.jpg"
+    image_main_path = published_image_path(
+        source_image,
+        repub_settings.REPUBLISHER_IMAGE[0],
+    )
+    image_fallback_path = published_image_path(
+        source_image,
+        repub_settings.REPUBLISHER_IMAGE[1],
+    )
+    image_thumbnail_path = thumbnail_image_path(
+        source_image,
+        repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
+    )
    audio_base_path = local_audio_path(source_audio)
    audio_default_path = published_media_path(
        FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0]
@ -94,6 +109,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    )

    def prepare_item(item: ElementItem) -> None:
+        item.images = [
+            {
+                "url": source_image,
+                "path": image_main_path,
+                "published_url": _published_url(
+                    "https://mirror.example",
+                    f"images/{image_main_path}",
+                ),
+                "checksum": "image-default",
+                "status": "downloaded",
+                "source_path": "source/ignored.png",
+                "variants": [
+                    {
+                        "url": _published_url(
+                            "https://mirror.example",
+                            f"images/{image_main_path}",
+                        ),
+                        "path": image_main_path,
+                        "type": "image/webp",
+                        "medium": "image",
+                        "isDefault": "true",
+                        "fileSize": "2345",
+                        "width": "1200",
+                        "height": "675",
+                    },
+                    {
+                        "url": _published_url(
+                            "https://mirror.example",
+                            f"images/{image_fallback_path}",
+                        ),
+                        "path": image_fallback_path,
+                        "type": "image/jpeg",
+                        "medium": "image",
+                        "isDefault": "false",
+                        "fileSize": "3456",
+                        "width": "1200",
+                        "height": "675",
+                    },
+                ],
+                "thumbnails": [
+                    {
+                        "url": _published_url(
+                            "https://mirror.example",
+                            f"images/{image_thumbnail_path}",
+                        ),
+                        "path": image_thumbnail_path,
+                        "slot": "card_hero",
+                        "type": "image/jpeg",
+                        "width": "640",
+                        "height": "360",
+                    }
+                ],
+            }
+        ]
        item.audios = [
            {
                "url": source_audio,
@ -261,6 +330,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
      <enclosure url="{source_audio}" length="123" type="audio/mpeg" />
      <content:encoded><![CDATA[<div mode="body" querystring="x=1"><img src="{source_image}" contenteditable="true"></div>]]></content:encoded>
+      <media:content url="{source_image}" type="image/jpeg" medium="image" expression="full" lang="en" />
      <media:content url="{source_video}" type="video/mp4" medium="video" expression="full" duration="60" width="640" height="360" lang="en" />
      <itunes:summary><![CDATA[{long_summary}]]></itunes:summary>
      <itunes:image href="{item_image}" />
@ -288,7 +358,11 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    assert last_build_date == item_pub_date
    assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false"
    assert channel.findtext("./image/url") == (
-        f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}"
+        "https://mirror.example/feeds/demo/images/"
+        + canonical_published_image_path(
+            channel_image,
+            repub_settings.REPUBLISHER_IMAGE,
+        )
    )

    atom_self = channel.find("atom:link", namespaces=nsmap)
@ -318,9 +392,63 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    assert root.find("./channel/item/media:content", namespaces=nsmap) is None

    media_groups = root.findall("./channel/item/media:group", namespaces=nsmap)
-    assert len(media_groups) == 2
+    assert len(media_groups) == 3
+
+    image_group = next(
+        group
+        for group in media_groups
+        if group.find("media:thumbnail", namespaces=nsmap) is not None
+    )
+    audio_group = next(
+        group
+        for group in media_groups
+        if group.findall("media:content", namespaces=nsmap)
+        and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "audio"
+    )
+    video_group = next(
+        group
+        for group in media_groups
+        if group.findall("media:content", namespaces=nsmap)
+        and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "video"
+    )
+
+    image_variants = image_group.findall("media:content", namespaces=nsmap)
+    assert [variant.attrib for variant in image_variants] == [
+        {
+            "url": (f"https://mirror.example/feeds/demo/images/" f"{image_main_path}"),
+            "type": "image/webp",
+            "medium": "image",
+            "isDefault": "true",
+            "expression": "full",
+            "lang": "en",
+            "height": "675",
+            "width": "1200",
+            "fileSize": "2345",
+        },
+        {
+            "url": (
+                f"https://mirror.example/feeds/demo/images/" f"{image_fallback_path}"
+            ),
+            "type": "image/jpeg",
+            "medium": "image",
+            "isDefault": "false",
+            "expression": "full",
+            "lang": "en",
+            "height": "675",
+            "width": "1200",
+            "fileSize": "3456",
+        },
+    ]
+    thumbnails = image_group.findall("media:thumbnail", namespaces=nsmap)
+    assert len(thumbnails) == 1
+    assert thumbnails[0].attrib == {
+        "url": (f"https://mirror.example/feeds/demo/images/" f"{image_thumbnail_path}"),
+        "width": "640",
+        "height": "360",
+        f"{{{nsmap['anynews']}}}slot": "card_hero",
+        f"{{{nsmap['anynews']}}}type": "image/jpeg",
+    }

-    audio_group, video_group = media_groups
    audio_variants = audio_group.findall("media:content", namespaces=nsmap)
    assert [variant.attrib for variant in audio_variants] == [
        {
@ -428,7 +556,13 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None:
    itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap)
    assert itunes_image is not None
    assert itunes_image.attrib == {
-        "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}"
+        "href": (
+            "https://mirror.example/feeds/demo/images/"
+            + canonical_published_image_path(
+                item_image,
+                repub_settings.REPUBLISHER_IMAGE,
+            )
+        )
    }

    itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap)
@ -494,3 +628,165 @@ def test_item_body_uses_description_only_when_content_is_also_present() -> None:
    assert both_present.findtext("content:encoded", namespaces=nsmap) == (
        "<div>Full body</div>"
    )
+
+
+def test_exporter_does_not_emit_media_rss_for_inline_only_images() -> None:
+    source_image = "https://source.example/media/inline.jpg"
+
+    def prepare_item(item: ElementItem) -> None:
+        item.images = [
+            {
+                "url": source_image,
+                "path": published_image_path(
+                    source_image,
+                    repub_settings.REPUBLISHER_IMAGE[0],
+                ),
+                "published_url": _published_url(
+                    "https://mirror.example",
+                    "images/"
+                    + published_image_path(
+                        source_image,
+                        repub_settings.REPUBLISHER_IMAGE[0],
+                    ),
+                ),
+                "checksum": "inline-image",
+                "status": "downloaded",
+                "source_path": "source/inline.jpg",
+                "variants": [
+                    {
+                        "url": _published_url(
+                            "https://mirror.example",
+                            "images/"
+                            + published_image_path(
+                                source_image,
+                                repub_settings.REPUBLISHER_IMAGE[0],
+                            ),
+                        ),
+                        "path": published_image_path(
+                            source_image,
+                            repub_settings.REPUBLISHER_IMAGE[0],
+                        ),
+                        "type": "image/webp",
+                        "medium": "image",
+                        "isDefault": "true",
+                        "width": "1200",
+                        "height": "675",
+                        "fileSize": "2345",
+                    }
+                ],
+                "thumbnails": [],
+            }
+        ]
+
+    _, root = _serialize_feed(
+        feed_url="https://mirror.example",
+        prepare_item=prepare_item,
+        feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+     xmlns:content="http://purl.org/rss/1.0/modules/content/">
+  <channel>
+    <title>Demo Feed</title>
+    <link>https://source.example/feed</link>
+    <description>Demo description</description>
+    <item>
+      <title>Inline Image Only</title>
+      <link>https://source.example/inline</link>
+      <guid isPermaLink="false">inline-only</guid>
+      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
+      <content:encoded><![CDATA[<div><img src="{source_image}"></div>]]></content:encoded>
+    </item>
+  </channel>
+</rss>
+""",
+    )
+
+    assert root.findall("./channel/item/media:group", namespaces=nsmap) == []
+
+
+def test_exporter_replaces_standalone_source_media_thumbnails() -> None:
+    source_image = "https://source.example/media/photo.jpg"
+    image_main_path = published_image_path(
+        source_image,
+        repub_settings.REPUBLISHER_IMAGE[0],
+    )
+    image_thumbnail_path = thumbnail_image_path(
+        source_image,
+        repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0],
+    )
+
+    def prepare_item(item: ElementItem) -> None:
+        item.images = [
+            {
+                "url": source_image,
+                "path": image_main_path,
+                "published_url": _published_url(
+                    "https://mirror.example",
+                    f"images/{image_main_path}",
+                ),
+                "checksum": "image-default",
+                "status": "downloaded",
+                "source_path": "source/ignored.png",
+                "variants": [
+                    {
+                        "url": _published_url(
+                            "https://mirror.example",
+                            f"images/{image_main_path}",
+                        ),
+                        "path": image_main_path,
+                        "type": "image/webp",
+                        "medium": "image",
+                        "isDefault": "true",
+                        "fileSize": "2345",
+                        "width": "1200",
+                        "height": "675",
+                    }
+                ],
+                "thumbnails": [
+                    {
+                        "url": _published_url(
+                            "https://mirror.example",
+                            f"images/{image_thumbnail_path}",
+                        ),
+                        "path": image_thumbnail_path,
+                        "slot": "card_hero",
+                        "type": "image/jpeg",
+                        "width": "640",
+                        "height": "360",
+                    }
+                ],
+            }
+        ]
+
+    _, root = _serialize_feed(
+        feed_url="https://mirror.example",
+        prepare_item=prepare_item,
+        feed_text=f"""<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+     xmlns:media="http://search.yahoo.com/mrss/">
+  <channel>
+    <title>Demo Feed</title>
+    <link>https://source.example/feed</link>
+    <description>Demo description</description>
+    <item>
+      <title>Entry One</title>
+      <link>https://source.example/entry-1</link>
+      <guid isPermaLink="false">entry-1</guid>
+      <pubDate>Tue, 31 Mar 2026 10:31:50 +0000</pubDate>
+      <media:content url="{source_image}" type="image/jpeg" medium="image" />
+      <media:thumbnail url="https://source.example/media/source-thumb.jpg" width="10" height="10" />
+    </item>
+  </channel>
+</rss>
+""",
+    )
+
+    thumbnails = root.findall("./channel/item/media:thumbnail", namespaces=nsmap)
+    assert thumbnails == []
+    group_thumbnails = root.findall(
+        "./channel/item/media:group/media:thumbnail",
+        namespaces=nsmap,
+    )
+    assert len(group_thumbnails) == 1
+    assert group_thumbnails[0].get("url") == (
+        f"https://mirror.example/feeds/demo/images/{image_thumbnail_path}"
+    )
--- a/tests/test_file_feeds.py
+++ b/tests/test_file_feeds.py
@ -8,10 +8,13 @@ from repub import settings as repub_settings
 from repub.spiders.rss_spider import RssFeedSpider
 from repub.utils import (
    FileType,
+    canonical_published_image_path,
    local_audio_path,
    local_image_path,
    local_video_path,
+    published_image_path,
    published_media_path,
+    thumbnail_image_path,
 )


@ -57,14 +60,17 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
+            "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
            "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
            "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
        }
    )

-    assert (
-        spider.rewrite_image_url("https://example.com/media/photo.jpg")
-        == f"images/{local_image_path('https://example.com/media/photo.jpg')}"
+    assert spider.rewrite_image_url(
+        "https://example.com/media/photo.jpg"
+    ) == "images/" + canonical_published_image_path(
+        "https://example.com/media/photo.jpg",
+        repub_settings.REPUBLISHER_IMAGE,
    )
    assert spider.rewrite_file_url(
        FileType.AUDIO,
@ -90,6 +96,28 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None:
    )


+def test_rss_spider_keeps_legacy_image_paths_when_image_normalization_disabled() -> (
+    None
+):
+    spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss")
+    spider.settings = Settings(
+        values={
+            "REPUBLISHER_IMAGE_DIR": "images",
+            "REPUBLISHER_FILE_DIR": "files",
+            "REPUBLISHER_AUDIO_DIR": "audio",
+            "REPUBLISHER_VIDEO_DIR": "video",
+            "REPUBLISHER_IMAGE_NORMALIZE_ENABLED": False,
+            "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
+            "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
+            "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
+        }
+    )
+
+    assert spider.rewrite_image_url("https://example.com/media/photo.jpg") == (
+        f"images/{local_image_path('https://example.com/media/photo.jpg')}"
+    )
+
+
 def test_published_media_path_changes_when_profile_args_change() -> None:
    source_url = "https://example.com/media/clip.mp4"
    audio_profile = repub_settings.REPUBLISHER_AUDIO[0]
@ -113,6 +141,41 @@ def test_published_media_path_changes_when_profile_args_change() -> None:
    ) != published_media_path(FileType.VIDEO, source_url, base_profile)


+def test_published_image_and_thumbnail_paths_change_when_profile_args_change() -> None:
+    source_url = "https://example.com/media/photo.png"
+    base_image_profile = repub_settings.REPUBLISHER_IMAGE[0]
+    base_thumbnail_profile = repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0]
+
+    assert canonical_published_image_path(
+        source_url,
+        repub_settings.REPUBLISHER_IMAGE,
+    ) == published_image_path(source_url, base_image_profile)
+
+    changed_image_profile = {
+        **base_image_profile,
+        "transform_kwargs": {
+            **base_image_profile["transform_kwargs"],
+            "width": 2048,
+        },
+    }
+    assert published_image_path(
+        source_url,
+        changed_image_profile,
+    ) != published_image_path(source_url, base_image_profile)
+
+    changed_thumbnail_profile = {
+        **base_thumbnail_profile,
+        "save_kwargs": {
+            **base_thumbnail_profile["save_kwargs"],
+            "Q": 60,
+        },
+    }
+    assert thumbnail_image_path(
+        source_url,
+        changed_thumbnail_profile,
+    ) != thumbnail_image_path(source_url, base_thumbnail_profile)
+
+
 def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
    feed_text = """<?xml version="1.0" encoding="UTF-8"?>
 <rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
@ -138,6 +201,7 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None:
            "REPUBLISHER_FILE_DIR": "files",
            "REPUBLISHER_AUDIO_DIR": "audio",
            "REPUBLISHER_VIDEO_DIR": "video",
+            "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE,
            "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO,
            "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO,
        }
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@ -20,17 +20,20 @@ from repub.items import ElementItem
 from repub.pipelines import (
    AudioPipeline,
    FilePipeline,
-    ImagePipeline,
+    ImageNormalizePipeline,
+    ImageThumbnailPipeline,
    VideoPipeline,
-    convert_image_body_to_jpeg,
    image_mimetype,
 )
 from repub.utils import (
    FileType,
+    canonical_published_image_path,
    local_audio_path,
-    local_image_path,
    local_video_path,
+    published_image_path,
    published_media_path,
+    source_image_path,
+    thumbnail_image_path,
 )


@ -54,8 +57,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
    return SimpleNamespace(settings=settings, request_fingerprinter=object())


+class HashableSpiderInfo:
+    __hash__ = object.__hash__
+
+    def __init__(self) -> None:
+        self.spider = SimpleNamespace()
+
+
 def spider_info() -> Any:
-    return SimpleNamespace(spider=SimpleNamespace())
+    return HashableSpiderInfo()


 def store_dir(pipeline: Any) -> Path:
@ -66,13 +76,14 @@ def transparent_png_bytes() -> bytes:
    return cast(Any, pyvips.Image.black(2, 3, bands=4)).pngsave_buffer()


-def jpeg_bytes() -> bytes:
-    return cast(Any, pyvips.Image.black(4, 5, bands=3)).jpegsave_buffer(Q=90)
+def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes:
+    return cast(Any, pyvips.Image.black(width, height, bands=bands)).pngsave_buffer()


@pytest.mark.parametrize(
    ("pipeline_cls", "store_setting"),
    [
+        (ImageNormalizePipeline, "IMAGES_STORE"),
        (AudioPipeline, "AUDIO_STORE"),
        (VideoPipeline, "VIDEO_STORE"),
        (FilePipeline, "FILES_STORE"),
@ -647,39 +658,16 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
    assert completed_item.audios == [result]


-def test_convert_image_body_to_jpeg_flattens_alpha_png() -> None:
-    converted, width, height = convert_image_body_to_jpeg(transparent_png_bytes())
-
-    assert (width, height) == (2, 3)
-    assert converted.getvalue().startswith(b"\xff\xd8\xff")
-
-    image = cast(Any, pyvips.Image.new_from_buffer(converted.getvalue(), ""))
-    assert image.width == 2
-    assert image.height == 3
-    assert image.bands == 3
-    assert min(image.getpoint(0, 0)) >= 240
-
-
-def test_convert_image_body_to_jpeg_passthroughs_jpeg_bytes() -> None:
-    source = jpeg_bytes()
-
-    converted, width, height = convert_image_body_to_jpeg(source)
-
-    assert (width, height) == (4, 5)
-    assert converted.getvalue() == source
-
-
 def test_image_mimetype_does_not_guess_from_url_extension() -> None:
    assert image_mimetype(url="https://example.com/photo.jpg") is None


-def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images(
+def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants(
    monkeypatch, tmp_path: Path
 ) -> None:
    crawler = build_test_crawler(tmp_path)
-    pipeline = ImagePipeline.from_crawler(cast(Crawler, crawler))
+    pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
-    persisted: list[tuple[str, bytes, dict[str, Any] | None, str | None]] = []
    source_url = "https://example.com/photo.png"
    item = ElementItem(
        feed_name="nasa",
@ -693,21 +681,179 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
        video_urls=[],
        videos=[],
    )
-
-    def fake_persist_file(path, buf, info, meta=None, headers=None):
-        del info
-        persisted.append(
-            (
-                path,
-                buf.getvalue(),
-                cast(dict[str, Any] | None, meta),
-                None if headers is None else headers.get("Content-Type"),
-            )
-        )
-
-    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)
+    canonical_path = canonical_published_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE"],
+    )
+    source_path = source_image_path(source_url, "image/png")
+    webp_path = published_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE"][0],
+    )
+    jpeg_path = published_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE"][1],
+    )
+    source_body = transparent_png_bytes()

    result = pipeline.media_downloaded(
+        Response(
+            url=source_url,
+            body=source_body,
+            status=200,
+            headers={"Content-Type": "image/png"},
+        ),
+        Request(source_url),
+        spider_info(),
+        item=item,
+    )
+    webp_file_size = result["variants"][0].get("fileSize")
+    jpeg_file_size = result["variants"][1].get("fileSize")
+
+    assert result == {
+        "url": source_url,
+        "path": canonical_path,
+        "published_url": f"https://mirror.example/feeds/nasa/images/{canonical_path}",
+        "checksum": result["checksum"],
+        "status": "downloaded",
+        "source_path": source_path,
+        "variants": [
+            {
+                "url": f"https://mirror.example/feeds/nasa/images/{webp_path}",
+                "path": webp_path,
+                "type": "image/webp",
+                "medium": "image",
+                "isDefault": "true",
+                "fileSize": webp_file_size,
+                "width": 2,
+                "height": 3,
+            },
+            {
+                "url": f"https://mirror.example/feeds/nasa/images/{jpeg_path}",
+                "path": jpeg_path,
+                "type": "image/jpeg",
+                "medium": "image",
+                "isDefault": "false",
+                "fileSize": jpeg_file_size,
+                "width": 2,
+                "height": 3,
+            },
+        ],
+        "thumbnails": [],
+    }
+    assert isinstance(result["checksum"], str)
+    assert isinstance(webp_file_size, int)
+    assert isinstance(jpeg_file_size, int)
+    assert (store_dir(pipeline) / source_path).read_bytes() == source_body
+    webp_image = cast(
+        Any,
+        pyvips.Image.new_from_file(str(store_dir(pipeline) / webp_path)),
+    )
+    jpeg_image = cast(
+        Any,
+        pyvips.Image.new_from_file(str(store_dir(pipeline) / jpeg_path)),
+    )
+    assert (webp_image.width, webp_image.height) == (2, 3)
+    assert (jpeg_image.width, jpeg_image.height) == (2, 3)
+    assert jpeg_image.bands == 3
+
+    completed_item = pipeline.item_completed([(True, result)], item, spider_info())
+    assert completed_item.images == [result]
+
+
+def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image(
+    monkeypatch, tmp_path: Path
+) -> None:
+    crawler = build_test_crawler(tmp_path)
+    normalize_pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
+    thumbnail_pipeline = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler))
+    monkeypatch.setattr(normalize_pipeline, "inc_stats", lambda status: None)
+    source_url = "https://example.com/photo.png"
+    source_body = png_bytes(1200, 900)
+    item = ElementItem(
+        feed_name="nasa",
+        el=None,
+        image_urls=[source_url],
+        images=[],
+        file_urls=[],
+        files=[],
+        audio_urls=[],
+        audios=[],
+        video_urls=[],
+        videos=[],
+    )
+
+    normalized = normalize_pipeline.media_downloaded(
+        Response(
+            url=source_url,
+            body=source_body,
+            status=200,
+            headers={"Content-Type": "image/png"},
+        ),
+        Request(source_url),
+        spider_info(),
+        item=item,
+    )
+    item.images = [normalized]
+
+    processed = thumbnail_pipeline.process_item(item, spider_info().spider)
+    thumbnails = processed.images[0]["thumbnails"]
+    thumb_slots = [thumb.get("slot") for thumb in thumbnails]
+    first_thumb = thumbnails[0]
+    second_thumb = thumbnails[1]
+
+    assert processed.images[0]["path"] == canonical_published_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE"],
+    )
+    assert thumb_slots == ["card_hero", "list_square"]
+    assert first_thumb.get("path") == thumbnail_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][0],
+    )
+    assert first_thumb.get("type") == "image/jpeg"
+    assert first_thumb.get("width") == 640
+    assert first_thumb.get("height") == 360
+    assert second_thumb.get("path") == thumbnail_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][1],
+    )
+    assert second_thumb.get("width") == 160
+    assert second_thumb.get("height") == 160
+    for thumb in thumbnails:
+        thumb_path = thumb.get("path")
+        thumb_width = thumb.get("width")
+        thumb_height = thumb.get("height")
+        thumb_image = cast(
+            Any,
+            pyvips.Image.new_from_file(
+                str(store_dir(normalize_pipeline) / str(thumb_path))
+            ),
+        )
+        assert (thumb_image.width, thumb_image.height) == (thumb_width, thumb_height)
+
+
+def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls(
+    monkeypatch, tmp_path: Path
+) -> None:
+    crawler = build_test_crawler(tmp_path)
+    pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
+    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
+    source_url = "https://example.com/photo"
+    item = ElementItem(
+        feed_name="nasa",
+        el=None,
+        image_urls=[source_url],
+        images=[],
+        file_urls=[],
+        files=[],
+        audio_urls=[],
+        audios=[],
+        video_urls=[],
+        videos=[],
+    )
+
+    downloaded = pipeline.media_downloaded(
        Response(
            url=source_url,
            body=transparent_png_bytes(),
@ -719,25 +865,11 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
        item=item,
    )

-    assert result == {
-        "url": source_url,
-        "path": local_image_path(source_url),
-        "checksum": result["checksum"],
-        "status": "downloaded",
-    }
-    assert isinstance(result["checksum"], str)
-    assert len(persisted) == 1
-    assert persisted[0][0] == local_image_path(source_url)
-    assert persisted[0][2] == {"width": 2, "height": 3}
-    assert persisted[0][3] == "image/jpeg"
+    uptodate = pipeline.media_to_download(Request(source_url), spider_info(), item=item)

-    image = cast(Any, pyvips.Image.new_from_buffer(persisted[0][1], ""))
-    assert image.width == 2
-    assert image.height == 3
-    assert image.bands == 3
-
-    completed_item = pipeline.item_completed([(True, result)], item, spider_info())
-    assert completed_item.images == [result]
+    assert downloaded["source_path"].endswith(".png")
+    assert uptodate is not None
+    assert uptodate["source_path"] == downloaded["source_path"]


 def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(