Replace image pipeline with profile-driven variants

- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
This commit is contained in:
Abel Luck 2026-05-27 09:24:22 +02:00
parent 7316d4723f
commit 525393272e
13 changed files with 1299 additions and 124 deletions

View file

@ -20,17 +20,20 @@ from repub.items import ElementItem
from repub.pipelines import (
AudioPipeline,
FilePipeline,
ImagePipeline,
ImageNormalizePipeline,
ImageThumbnailPipeline,
VideoPipeline,
convert_image_body_to_jpeg,
image_mimetype,
)
from repub.utils import (
FileType,
canonical_published_image_path,
local_audio_path,
local_image_path,
local_video_path,
published_image_path,
published_media_path,
source_image_path,
thumbnail_image_path,
)
@ -54,8 +57,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace:
return SimpleNamespace(settings=settings, request_fingerprinter=object())
class HashableSpiderInfo:
    """Minimal hashable test double exposing a ``spider`` attribute.

    Hashing is delegated to ``object.__hash__`` so instances can be hashed.
    NOTE(review): presumably the pipelines under test hash the info object
    they are given; confirm against the pipeline code.
    """

    def __init__(self) -> None:
        # Empty namespace used as the placeholder spider.
        self.spider = SimpleNamespace()

    def __hash__(self) -> int:
        return object.__hash__(self)
def spider_info() -> Any:
    """Return a fresh hashable stand-in for the pipelines' ``info`` argument.

    The returned object exposes a ``spider`` attribute. A plain
    ``SimpleNamespace`` is not returned here because it is unhashable; the
    earlier ``return SimpleNamespace(...)`` statement made the hashable
    return below it unreachable.
    """
    return HashableSpiderInfo()
def store_dir(pipeline: Any) -> Path:
@ -66,13 +76,14 @@ def transparent_png_bytes() -> bytes:
return cast(Any, pyvips.Image.black(2, 3, bands=4)).pngsave_buffer()
def jpeg_bytes() -> bytes:
    """Return JPEG-encoded bytes (Q=90) for a black 4x5 three-band image."""
    canvas = cast(Any, pyvips.Image.black(4, 5, bands=3))
    return canvas.jpegsave_buffer(Q=90)
def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes:
    """Return PNG-encoded bytes for a black image.

    Args:
        width: Image width passed to ``pyvips.Image.black``.
        height: Image height passed to ``pyvips.Image.black``.
        bands: Number of bands in the generated image (defaults to 4).
    """
    blank = cast(Any, pyvips.Image.black(width, height, bands=bands))
    return blank.pngsave_buffer()
@pytest.mark.parametrize(
("pipeline_cls", "store_setting"),
[
(ImageNormalizePipeline, "IMAGES_STORE"),
(AudioPipeline, "AUDIO_STORE"),
(VideoPipeline, "VIDEO_STORE"),
(FilePipeline, "FILES_STORE"),
@ -647,39 +658,16 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert completed_item.audios == [result]
def test_convert_image_body_to_jpeg_flattens_alpha_png() -> None:
    """A 2x3 four-band PNG converts to a three-band JPEG of the same size."""
    converted, width, height = convert_image_body_to_jpeg(transparent_png_bytes())

    assert (width, height) == (2, 3)
    # The converted payload must start with the JPEG signature bytes.
    assert converted.getvalue().startswith(b"\xff\xd8\xff")

    decoded = cast(Any, pyvips.Image.new_from_buffer(converted.getvalue(), ""))
    assert decoded.width == 2
    assert decoded.height == 3
    assert decoded.bands == 3
    # Every channel of the top-left pixel must be at least 240.
    top_left_pixel = decoded.getpoint(0, 0)
    assert min(top_left_pixel) >= 240
def test_convert_image_body_to_jpeg_passthroughs_jpeg_bytes() -> None:
    """JPEG input comes back byte-for-byte with 4x5 dimensions reported."""
    original = jpeg_bytes()
    result_buffer, reported_width, reported_height = convert_image_body_to_jpeg(
        original
    )

    assert (reported_width, reported_height) == (4, 5)
    assert result_buffer.getvalue() == original
def test_image_mimetype_does_not_guess_from_url_extension() -> None:
    """Given only a ``.jpg`` URL, ``image_mimetype`` must return ``None``."""
    guessed = image_mimetype(url="https://example.com/photo.jpg")
    assert guessed is None
def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images(
def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = ImagePipeline.from_crawler(cast(Crawler, crawler))
pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
persisted: list[tuple[str, bytes, dict[str, Any] | None, str | None]] = []
source_url = "https://example.com/photo.png"
item = ElementItem(
feed_name="nasa",
@ -693,21 +681,179 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
video_urls=[],
videos=[],
)
def fake_persist_file(path, buf, info, meta=None, headers=None):
del info
persisted.append(
(
path,
buf.getvalue(),
cast(dict[str, Any] | None, meta),
None if headers is None else headers.get("Content-Type"),
)
)
monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)
canonical_path = canonical_published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"],
)
source_path = source_image_path(source_url, "image/png")
webp_path = published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"][0],
)
jpeg_path = published_image_path(
source_url,
crawler.settings["REPUBLISHER_IMAGE"][1],
)
source_body = transparent_png_bytes()
result = pipeline.media_downloaded(
Response(
url=source_url,
body=source_body,
status=200,
headers={"Content-Type": "image/png"},
),
Request(source_url),
spider_info(),
item=item,
)
webp_file_size = result["variants"][0].get("fileSize")
jpeg_file_size = result["variants"][1].get("fileSize")
assert result == {
"url": source_url,
"path": canonical_path,
"published_url": f"https://mirror.example/feeds/nasa/images/{canonical_path}",
"checksum": result["checksum"],
"status": "downloaded",
"source_path": source_path,
"variants": [
{
"url": f"https://mirror.example/feeds/nasa/images/{webp_path}",
"path": webp_path,
"type": "image/webp",
"medium": "image",
"isDefault": "true",
"fileSize": webp_file_size,
"width": 2,
"height": 3,
},
{
"url": f"https://mirror.example/feeds/nasa/images/{jpeg_path}",
"path": jpeg_path,
"type": "image/jpeg",
"medium": "image",
"isDefault": "false",
"fileSize": jpeg_file_size,
"width": 2,
"height": 3,
},
],
"thumbnails": [],
}
assert isinstance(result["checksum"], str)
assert isinstance(webp_file_size, int)
assert isinstance(jpeg_file_size, int)
assert (store_dir(pipeline) / source_path).read_bytes() == source_body
webp_image = cast(
Any,
pyvips.Image.new_from_file(str(store_dir(pipeline) / webp_path)),
)
jpeg_image = cast(
Any,
pyvips.Image.new_from_file(str(store_dir(pipeline) / jpeg_path)),
)
assert (webp_image.width, webp_image.height) == (2, 3)
assert (jpeg_image.width, jpeg_image.height) == (2, 3)
assert jpeg_image.bands == 3
completed_item = pipeline.item_completed([(True, result)], item, spider_info())
assert completed_item.images == [result]
def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image(
    monkeypatch, tmp_path: Path
) -> None:
    """Normalize a 1200x900 PNG, then check the thumbnails added to the item.

    The normalize pipeline's ``media_downloaded`` result is stored on the item,
    the thumbnail pipeline processes the item, and the resulting ``thumbnails``
    entries are checked for slot names, paths, type, and dimensions. Each
    thumbnail file is then opened from the store directory to confirm its
    size matches the reported metadata.
    """
    crawler = build_test_crawler(tmp_path)
    normalize_pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
    thumbnail_pipeline = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(normalize_pipeline, "inc_stats", lambda status: None)

    source_url = "https://example.com/photo.png"
    source_body = png_bytes(1200, 900)
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )

    # Run the normalize step first, then hand its result to the thumbnail step.
    png_response = Response(
        url=source_url,
        body=source_body,
        status=200,
        headers={"Content-Type": "image/png"},
    )
    normalized = normalize_pipeline.media_downloaded(
        png_response,
        Request(source_url),
        spider_info(),
        item=item,
    )
    item.images = [normalized]
    processed = thumbnail_pipeline.process_item(item, spider_info().spider)

    thumbnails = processed.images[0]["thumbnails"]
    thumb_slots = [entry.get("slot") for entry in thumbnails]
    first_thumb, second_thumb = thumbnails[0], thumbnails[1]

    expected_canonical = canonical_published_image_path(
        source_url,
        crawler.settings["REPUBLISHER_IMAGE"],
    )
    assert processed.images[0]["path"] == expected_canonical
    assert thumb_slots == ["card_hero", "list_square"]

    expected_first_path = thumbnail_image_path(
        source_url,
        crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][0],
    )
    assert first_thumb.get("path") == expected_first_path
    assert first_thumb.get("type") == "image/jpeg"
    assert first_thumb.get("width") == 640
    assert first_thumb.get("height") == 360

    expected_second_path = thumbnail_image_path(
        source_url,
        crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][1],
    )
    assert second_thumb.get("path") == expected_second_path
    assert second_thumb.get("width") == 160
    assert second_thumb.get("height") == 160

    # The files on disk must match the dimensions reported in the metadata.
    for entry in thumbnails:
        reported_path = entry.get("path")
        reported_size = (entry.get("width"), entry.get("height"))
        on_disk = cast(
            Any,
            pyvips.Image.new_from_file(
                str(store_dir(normalize_pipeline) / str(reported_path))
            ),
        )
        assert (on_disk.width, on_disk.height) == reported_size
def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls(
monkeypatch, tmp_path: Path
) -> None:
crawler = build_test_crawler(tmp_path)
pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler))
monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)
source_url = "https://example.com/photo"
item = ElementItem(
feed_name="nasa",
el=None,
image_urls=[source_url],
images=[],
file_urls=[],
files=[],
audio_urls=[],
audios=[],
video_urls=[],
videos=[],
)
downloaded = pipeline.media_downloaded(
Response(
url=source_url,
body=transparent_png_bytes(),
@ -719,25 +865,11 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images
item=item,
)
assert result == {
"url": source_url,
"path": local_image_path(source_url),
"checksum": result["checksum"],
"status": "downloaded",
}
assert isinstance(result["checksum"], str)
assert len(persisted) == 1
assert persisted[0][0] == local_image_path(source_url)
assert persisted[0][2] == {"width": 2, "height": 3}
assert persisted[0][3] == "image/jpeg"
uptodate = pipeline.media_to_download(Request(source_url), spider_info(), item=item)
image = cast(Any, pyvips.Image.new_from_buffer(persisted[0][1], ""))
assert image.width == 2
assert image.height == 3
assert image.bands == 3
completed_item = pipeline.item_completed([(True, result)], item, spider_info())
assert completed_item.images == [result]
assert downloaded["source_path"].endswith(".png")
assert uptodate is not None
assert uptodate["source_path"] == downloaded["source_path"]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(