Replace Scrapy image pipeline with pyvips

This commit is contained in:
Abel Luck 2026-04-08 16:39:39 +02:00
parent 180677efa7
commit 7316d4723f
5 changed files with 235 additions and 28 deletions

View file

@ -4,6 +4,7 @@ from types import SimpleNamespace
from typing import Any, cast
import pytest
import pyvips
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
@ -16,10 +17,18 @@ from repub.config import (
build_feed_settings,
)
from repub.items import ElementItem
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline
from repub.pipelines import (
AudioPipeline,
FilePipeline,
ImagePipeline,
VideoPipeline,
convert_image_body_to_jpeg,
image_mimetype,
)
from repub.utils import (
FileType,
local_audio_path,
local_image_path,
local_video_path,
published_media_path,
)
@ -53,6 +62,14 @@ def store_dir(pipeline: Any) -> Path:
return Path(cast(Any, pipeline.store).basedir)
def transparent_png_bytes() -> bytes:
    """Return PNG-encoded bytes of a 2x3, four-band, all-zero pyvips image.

    The fourth band is presumably treated as a fully transparent alpha
    channel by the PNG encoder (hence the helper's name).
    """
    # cast(Any, ...) silences the type checker for pyvips' dynamic methods.
    canvas = cast(Any, pyvips.Image.black(2, 3, bands=4))
    return canvas.pngsave_buffer()
def jpeg_bytes() -> bytes:
    """Return JPEG-encoded bytes (Q=90) of a 4x5, three-band, all-zero image."""
    # cast(Any, ...) silences the type checker for pyvips' dynamic methods.
    canvas = cast(Any, pyvips.Image.black(4, 5, bands=3))
    return canvas.jpegsave_buffer(Q=90)
@pytest.mark.parametrize(
("pipeline_cls", "store_setting"),
[
@ -630,6 +647,99 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert completed_item.audios == [result]
def test_convert_image_body_to_jpeg_flattens_alpha_png() -> None:
    """A four-band PNG is converted to a three-band JPEG of the same size.

    The top-left pixel of the result must be near-white (every channel at
    least 240), i.e. the zero-alpha source was flattened onto a light
    background.
    """
    converted, width, height = convert_image_body_to_jpeg(transparent_png_bytes())
    jpeg_payload = converted.getvalue()

    # Reported dimensions match the 2x3 source.
    assert (width, height) == (2, 3)
    # JPEG streams start with the SOI marker followed by 0xFF.
    assert jpeg_payload.startswith(b"\xff\xd8\xff")

    # Decode the output again to inspect what was actually written.
    decoded = cast(Any, pyvips.Image.new_from_buffer(jpeg_payload, ""))
    assert decoded.width == 2
    assert decoded.height == 3
    assert decoded.bands == 3

    darkest_channel = min(decoded.getpoint(0, 0))
    assert darkest_channel >= 240
def test_convert_image_body_to_jpeg_passthroughs_jpeg_bytes() -> None:
    """JPEG input is returned byte-for-byte, with its dimensions reported."""
    original_jpeg = jpeg_bytes()

    result_buffer, width, height = convert_image_body_to_jpeg(original_jpeg)

    assert (width, height) == (4, 5)
    # No re-encoding: the output bytes are exactly the input bytes.
    assert result_buffer.getvalue() == original_jpeg
def test_image_mimetype_does_not_guess_from_url_extension() -> None:
    """With only a URL to go on, image_mimetype yields None despite '.jpg'."""
    guessed = image_mimetype(url="https://example.com/photo.jpg")
    assert guessed is None
def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images(
    monkeypatch, tmp_path: Path
) -> None:
    """ImagePipeline stores a PNG download as JPEG and records it on the item.

    ``media_downloaded`` is fed a PNG response; the test checks that exactly
    one file is persisted at ``local_image_path(source_url)`` with
    width/height metadata and an ``image/jpeg`` Content-Type header, that the
    stored bytes decode to a 2x3 three-band image, and that
    ``item_completed`` puts the result dict into ``item.images``.
    """
    source_url = "https://example.com/photo.png"

    pipeline = ImagePipeline.from_crawler(
        cast(Crawler, build_test_crawler(tmp_path))
    )

    def ignore_stats(status):
        # Stats collection is irrelevant here; swallow the call.
        return None

    monkeypatch.setattr(pipeline, "inc_stats", ignore_stats)

    # Each entry: (path, stored bytes, meta, Content-Type header or None).
    persisted: list[tuple[str, bytes, dict[str, Any] | None, str | None]] = []

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        # Record what the pipeline would have written instead of touching disk.
        del info
        content_type = None if headers is None else headers.get("Content-Type")
        record = (
            path,
            buf.getvalue(),
            cast(dict[str, Any] | None, meta),
            content_type,
        )
        persisted.append(record)

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )

    response = Response(
        url=source_url,
        body=transparent_png_bytes(),
        status=200,
        headers={"Content-Type": "image/png"},
    )
    request = Request(source_url)
    result = pipeline.media_downloaded(response, request, spider_info(), item=item)

    # The checksum value itself is not pinned; only its presence and type are.
    expected_result = {
        "url": source_url,
        "path": local_image_path(source_url),
        "checksum": result["checksum"],
        "status": "downloaded",
    }
    assert result == expected_result
    assert isinstance(result["checksum"], str)

    assert len(persisted) == 1
    stored_path, stored_body, stored_meta, stored_content_type = persisted[0]
    assert stored_path == local_image_path(source_url)
    assert stored_meta == {"width": 2, "height": 3}
    assert stored_content_type == "image/jpeg"

    # Decode the persisted bytes to confirm the stored image's shape.
    decoded = cast(Any, pyvips.Image.new_from_buffer(stored_body, ""))
    assert decoded.width == 2
    assert decoded.height == 3
    assert decoded.bands == 3

    completed_item = pipeline.item_completed([(True, result)], item, spider_info())
    assert completed_item.images == [result]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
monkeypatch, tmp_path: Path
) -> None: