Replace image pipeline with profile-driven variants

- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
2026-05-27 09:24:22 +02:00 · 2026-05-27 09:24:22 +02:00 · 525393272e
commit 525393272e
parent 7316d4723f
13 changed files with 1299 additions and 124 deletions
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@ -20,17 +20,20 @@ from repub.items import ElementItem
 from repub.pipelines import (
    AudioPipeline,
    FilePipeline,
-    ImagePipeline,
+    ImageNormalizePipeline,
+    ImageThumbnailPipeline,
    VideoPipeline,
-    convert_image_body_to_jpeg,
    image_mimetype,
 )
 from repub.utils import (
    FileType,
+    canonical_published_image_path,
    local_audio_path,
-    local_image_path,
    local_video_path,
+    published_image_path,
    published_media_path,
+    source_image_path,
+    thumbnail_image_path,
 )


@ -54,8 +57,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
    return SimpleNamespace(settings=settings, request_fingerprinter=object())


+class HashableSpiderInfo:
+    __hash__ = object.__hash__
+
+    def __init__(self) -> None:
+        self.spider = SimpleNamespace()
+
+
 def spider_info() -> Any:
-    return SimpleNamespace(spider=SimpleNamespace())
+    return HashableSpiderInfo()


 def store_dir(pipeline: Any) -> Path:
@ -66,13 +76,14 @@ def transparent_png_bytes() -> bytes:
    return cast(Any, pyvips.Image.black(2, 3, bands=4)).pngsave_buffer()


-def jpeg_bytes() -> bytes:
-    return cast(Any, pyvips.Image.black(4, 5, bands=3)).jpegsave_buffer(Q=90)
+def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes:
+    return cast(Any, pyvips.Image.black(width, height, bands=bands)).pngsave_buffer()


@pytest.mark.parametrize(
    ("pipeline_cls", "store_setting"),
    [
+        (ImageNormalizePipeline, "IMAGES_STORE"),
        (AudioPipeline, "AUDIO_STORE"),
        (VideoPipeline, "VIDEO_STORE"),
        (FilePipeline, "FILES_STORE"),
@ -647,39 +658,16 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
    assert completed_item.audios == [result]


-def test_convert_image_body_to_jpeg_flattens_alpha_png() -> None:
-    converted, width, height = convert_image_body_to_jpeg(transparent_png_bytes())
-
-    assert (width, height) == (2, 3)
-    assert converted.getvalue().startswith(b"\xff\xd8\xff")
-
-    image = cast(Any, pyvips.Image.new_from_buffer(converted.getvalue(), ""))
-    assert image.width == 2
-    assert image.height == 3
-    assert image.bands == 3
-    assert min(image.getpoint(0, 0)) >= 240
-
-
-def test_convert_image_body_to_jpeg_passthroughs_jpeg_bytes() -> None:
-    source = jpeg_bytes()
-
-    converted, width, height = convert_image_body_to_jpeg(source)
-
-    assert (width, height) == (4, 5)
-    assert converted.getvalue() == source
-
-
 def test_image_mimetype_does_not_guess_from_url_extension() -> None:
    assert image_mimetype(url="https://example.com/photo.jpg") is None


-def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images(
+def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants(
    monkeypatch, tmp_path: Path
 ) -> None:
    crawler = build_test_crawler(tmp_path)
-    pipeline = ImagePipeline.from_crawler(cast(Crawler, crawler))
+    pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
-    persisted: list[tuple[str, bytes, dict[str, Any] | None, str | None]] = []
    source_url = "https://example.com/photo.png"
    item = ElementItem(
        feed_name="nasa",
@ -693,21 +681,179 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
        video_urls=[],
        videos=[],
    )
-
-    def fake_persist_file(path, buf, info, meta=None, headers=None):
-        del info
-        persisted.append(
-            (
-                path,
-                buf.getvalue(),
-                cast(dict[str, Any] | None, meta),
-                None if headers is None else headers.get("Content-Type"),
-            )
-        )
-
-    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)
+    canonical_path = canonical_published_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE"],
+    )
+    source_path = source_image_path(source_url, "image/png")
+    webp_path = published_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE"][0],
+    )
+    jpeg_path = published_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE"][1],
+    )
+    source_body = transparent_png_bytes()

    result = pipeline.media_downloaded(
+        Response(
+            url=source_url,
+            body=source_body,
+            status=200,
+            headers={"Content-Type": "image/png"},
+        ),
+        Request(source_url),
+        spider_info(),
+        item=item,
+    )
+    webp_file_size = result["variants"][0].get("fileSize")
+    jpeg_file_size = result["variants"][1].get("fileSize")
+
+    assert result == {
+        "url": source_url,
+        "path": canonical_path,
+        "published_url": f"https://mirror.example/feeds/nasa/images/{canonical_path}",
+        "checksum": result["checksum"],
+        "status": "downloaded",
+        "source_path": source_path,
+        "variants": [
+            {
+                "url": f"https://mirror.example/feeds/nasa/images/{webp_path}",
+                "path": webp_path,
+                "type": "image/webp",
+                "medium": "image",
+                "isDefault": "true",
+                "fileSize": webp_file_size,
+                "width": 2,
+                "height": 3,
+            },
+            {
+                "url": f"https://mirror.example/feeds/nasa/images/{jpeg_path}",
+                "path": jpeg_path,
+                "type": "image/jpeg",
+                "medium": "image",
+                "isDefault": "false",
+                "fileSize": jpeg_file_size,
+                "width": 2,
+                "height": 3,
+            },
+        ],
+        "thumbnails": [],
+    }
+    assert isinstance(result["checksum"], str)
+    assert isinstance(webp_file_size, int)
+    assert isinstance(jpeg_file_size, int)
+    assert (store_dir(pipeline) / source_path).read_bytes() == source_body
+    webp_image = cast(
+        Any,
+        pyvips.Image.new_from_file(str(store_dir(pipeline) / webp_path)),
+    )
+    jpeg_image = cast(
+        Any,
+        pyvips.Image.new_from_file(str(store_dir(pipeline) / jpeg_path)),
+    )
+    assert (webp_image.width, webp_image.height) == (2, 3)
+    assert (jpeg_image.width, jpeg_image.height) == (2, 3)
+    assert jpeg_image.bands == 3
+
+    completed_item = pipeline.item_completed([(True, result)], item, spider_info())
+    assert completed_item.images == [result]
+
+
+def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image(
+    monkeypatch, tmp_path: Path
+) -> None:
+    crawler = build_test_crawler(tmp_path)
+    normalize_pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
+    thumbnail_pipeline = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler))
+    monkeypatch.setattr(normalize_pipeline, "inc_stats", lambda status: None)
+    source_url = "https://example.com/photo.png"
+    source_body = png_bytes(1200, 900)
+    item = ElementItem(
+        feed_name="nasa",
+        el=None,
+        image_urls=[source_url],
+        images=[],
+        file_urls=[],
+        files=[],
+        audio_urls=[],
+        audios=[],
+        video_urls=[],
+        videos=[],
+    )
+
+    normalized = normalize_pipeline.media_downloaded(
+        Response(
+            url=source_url,
+            body=source_body,
+            status=200,
+            headers={"Content-Type": "image/png"},
+        ),
+        Request(source_url),
+        spider_info(),
+        item=item,
+    )
+    item.images = [normalized]
+
+    processed = thumbnail_pipeline.process_item(item, spider_info().spider)
+    thumbnails = processed.images[0]["thumbnails"]
+    thumb_slots = [thumb.get("slot") for thumb in thumbnails]
+    first_thumb = thumbnails[0]
+    second_thumb = thumbnails[1]
+
+    assert processed.images[0]["path"] == canonical_published_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE"],
+    )
+    assert thumb_slots == ["card_hero", "list_square"]
+    assert first_thumb.get("path") == thumbnail_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][0],
+    )
+    assert first_thumb.get("type") == "image/jpeg"
+    assert first_thumb.get("width") == 640
+    assert first_thumb.get("height") == 360
+    assert second_thumb.get("path") == thumbnail_image_path(
+        source_url,
+        crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][1],
+    )
+    assert second_thumb.get("width") == 160
+    assert second_thumb.get("height") == 160
+    for thumb in thumbnails:
+        thumb_path = thumb.get("path")
+        thumb_width = thumb.get("width")
+        thumb_height = thumb.get("height")
+        thumb_image = cast(
+            Any,
+            pyvips.Image.new_from_file(
+                str(store_dir(normalize_pipeline) / str(thumb_path))
+            ),
+        )
+        assert (thumb_image.width, thumb_image.height) == (thumb_width, thumb_height)
+
+
+def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls(
+    monkeypatch, tmp_path: Path
+) -> None:
+    crawler = build_test_crawler(tmp_path)
+    pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
+    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
+    source_url = "https://example.com/photo"
+    item = ElementItem(
+        feed_name="nasa",
+        el=None,
+        image_urls=[source_url],
+        images=[],
+        file_urls=[],
+        files=[],
+        audio_urls=[],
+        audios=[],
+        video_urls=[],
+        videos=[],
+    )
+
+    downloaded = pipeline.media_downloaded(
        Response(
            url=source_url,
            body=transparent_png_bytes(),
@ -719,25 +865,11 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
        item=item,
    )

-    assert result == {
-        "url": source_url,
-        "path": local_image_path(source_url),
-        "checksum": result["checksum"],
-        "status": "downloaded",
-    }
-    assert isinstance(result["checksum"], str)
-    assert len(persisted) == 1
-    assert persisted[0][0] == local_image_path(source_url)
-    assert persisted[0][2] == {"width": 2, "height": 3}
-    assert persisted[0][3] == "image/jpeg"
+    uptodate = pipeline.media_to_download(Request(source_url), spider_info(), item=item)

-    image = cast(Any, pyvips.Image.new_from_buffer(persisted[0][1], ""))
-    assert image.width == 2
-    assert image.height == 3
-    assert image.bands == 3
-
-    completed_item = pipeline.item_completed([(True, result)], item, spider_info())
-    assert completed_item.images == [result]
+    assert downloaded["source_path"].endswith(".png")
+    assert uptodate is not None
+    assert uptodate["source_path"] == downloaded["source_path"]


 def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(