Replace Scrapy image pipeline with pyvips

This commit is contained in:
Abel Luck 2026-04-08 16:39:39 +02:00
parent 180677efa7
commit 7316d4723f
5 changed files with 235 additions and 28 deletions

View file

@ -63,6 +63,12 @@
feedgen = prev.feedgen.overrideAttrs (old: { feedgen = prev.feedgen.overrideAttrs (old: {
nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ final.setuptools ]; nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ final.setuptools ];
}); });
pyvips = prev.pyvips.overrideAttrs (old: {
nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [
final.setuptools
final.pkgconfig
];
});
pygea = prev.pygea.overrideAttrs (old: { pygea = prev.pygea.overrideAttrs (old: {
nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [
final.hatchling final.hatchling
@ -108,6 +114,7 @@
checkPhase = '' checkPhase = ''
runHook preCheck runHook preCheck
export HOME="$(mktemp -d)" export HOME="$(mktemp -d)"
export LD_LIBRARY_PATH="${pkgs.lib.makeLibraryPath [ pkgs.vips ]}:$LD_LIBRARY_PATH"
pytest tests/ -v pytest tests/ -v
runHook postCheck runHook postCheck
''; '';
@ -125,7 +132,8 @@
postBuild = '' postBuild = ''
rm -f "$out/bin/repub" rm -f "$out/bin/repub"
makeWrapper "${baseVenv}/bin/repub" "$out/bin/repub" \ makeWrapper "${baseVenv}/bin/repub" "$out/bin/repub" \
--prefix PATH : "${pkgs.lib.makeBinPath [ ffmpegPackage ]}" --prefix PATH : "${pkgs.lib.makeBinPath [ ffmpegPackage ]}" \
--prefix LD_LIBRARY_PATH : "${pkgs.lib.makeLibraryPath [ pkgs.vips ]}"
''; '';
meta.mainProgram = "repub"; meta.mainProgram = "repub";
}; };
@ -273,12 +281,14 @@
packages = [ packages = [
pkgs.tailwindcss_4 pkgs.tailwindcss_4
pkgs.python313 pkgs.python313
pkgs.vips
pkgs.uv pkgs.uv
pkgs.pyright pkgs.pyright
(mkFfmpegPackage pkgs) (mkFfmpegPackage pkgs)
]; ];
env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [ env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [
pkgs.stdenv.cc.cc pkgs.stdenv.cc.cc
pkgs.vips
]; ];
env.UV_PROJECT_ENVIRONMENT = ".venv"; env.UV_PROJECT_ENVIRONMENT = ".venv";
env.UV_PYTHON_DOWNLOADS = "never"; env.UV_PYTHON_DOWNLOADS = "never";

View file

@ -12,7 +12,7 @@ dependencies = [
"colorlog>=6.8.2,<7.0.0", "colorlog>=6.8.2,<7.0.0",
"feedparser>=6.0.11,<7.0.0", "feedparser>=6.0.11,<7.0.0",
"lxml>=5.2.1,<6.0.0", "lxml>=5.2.1,<6.0.0",
"pillow>=10.3.0,<11.0.0", "pyvips>=3.0.0,<4.0.0",
"ffmpeg-python>=0.2.0,<0.3.0", "ffmpeg-python>=0.2.0,<0.3.0",
"Quart>=0.20.0,<0.21.0", "Quart>=0.20.0,<0.21.0",
"hypercorn>=0.18.0,<0.19.0", "hypercorn>=0.18.0,<0.19.0",

View file

@ -1,3 +1,4 @@
import functools
import hashlib import hashlib
import logging import logging
import mimetypes import mimetypes
@ -8,10 +9,10 @@ from os import PathLike
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Union, cast from typing import Any, Dict, List, Optional, Union, cast
import pyvips
from scrapy.crawler import Crawler from scrapy.crawler import Crawler
from scrapy.pipelines.files import FileException from scrapy.pipelines.files import FileException
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
import repub.utils import repub.utils
from repub import media from repub import media
@ -20,12 +21,108 @@ from repub.items import MediaVariant, TranscodedMediaFile
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class ImagePipeline(BaseImagesPipeline): class ImageException(FileException):
"""General image error exception"""
def image_mimetype(response=None, *, url: str | None = None) -> str | None:
del url
if response is not None:
content_type = response.headers.get(b"Content-Type")
if content_type:
return content_type.decode("utf-8").split(";", 1)[0].strip()
return None
def convert_image_body_to_jpeg(
    body: bytes,
    *,
    source_mimetype: str | None = None,
) -> tuple[BytesIO, int, int]:
    """Return *body* as JPEG data together with the image's width and height.

    Bodies that libvips identifies as JPEG are passed through unchanged.
    Everything else is flattened onto a white background (when it has an
    alpha channel), converted to sRGB and re-encoded as JPEG.

    Args:
        body: Raw bytes of the downloaded image.
        source_mimetype: Media type reported by the server, if any. It is
            only used as a fallback when libvips does not expose a
            ``vips-loader`` field, and only when *body* also starts with the
            JPEG start-of-image marker, so a mislabelled non-JPEG body is
            never stored verbatim as a JPEG.

    Returns:
        ``(buffer, width, height)`` where *buffer* holds the JPEG bytes and
        the dimensions are read after ``autorot()`` has been applied.

    Raises:
        ImageException: If libvips fails at any stage (loading, transforming
            or encoding).
    """
    jpeg_magic = b"\xff\xd8\xff"

    # libvips evaluates lazily: new_from_buffer() only parses the header, so
    # a truncated or corrupt body may not fail until jpegsave_buffer() pulls
    # the pixels. The whole pipeline therefore lives inside one try block.
    try:
        # NOTE(review): autorot() on a sequential-access image may need random
        # access for 90-degree rotations - confirm against the libvips
        # version in use.
        image = cast(
            Any,
            pyvips.Image.new_from_buffer(body, "", access="sequential"),
        ).autorot()

        width = image.width
        height = image.height

        loader = ""
        if image.get_typeof("vips-loader"):
            loader = str(image.get("vips-loader"))

        declared_jpeg = (source_mimetype or "").strip().lower() == "image/jpeg"
        is_jpeg = loader.startswith("jpegload") or (
            not loader and declared_jpeg and body.startswith(jpeg_magic)
        )
        if is_jpeg:
            return BytesIO(body), width, height

        if image.hasalpha():
            # JPEG has no alpha channel; composite onto white.
            image = image.flatten(background=[255, 255, 255])
        image = image.colourspace("srgb")
        return BytesIO(image.jpegsave_buffer()), width, height
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc
class ImagePipeline(BaseFilesPipeline):
    """Files pipeline that persists every downloaded image as a JPEG.

    Configuration is read from the ``IMAGES_*`` settings, and items use the
    ``image_urls`` / ``images`` fields unless the settings say otherwise.
    """

    MEDIA_NAME = "image"
    EXPIRES = 90
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    DEFAULT_FILES_URLS_FIELD = "image_urls"
    DEFAULT_FILES_RESULT_FIELD = "images"

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Create the pipeline using the crawler's ``IMAGES_STORE`` setting."""
        settings = crawler.settings
        cls._update_stores(settings)
        return cls(settings["IMAGES_STORE"], crawler=crawler)

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        """Initialise the base pipeline, then apply the ``IMAGES_*`` settings."""
        self.settings = crawler.settings
        super().__init__(store_uri, crawler=crawler)

        # Resolves a setting name through the base pipeline's _key_for_pipe.
        setting_key = functools.partial(
            self._key_for_pipe,
            base_class_name="ImagesPipeline",
            settings=self.settings,
        )

        expires_key = setting_key("IMAGES_EXPIRES")
        self.expires = self.settings.getint(expires_key, self.EXPIRES)

        urls_field_key = setting_key("IMAGES_URLS_FIELD")
        self.files_urls_field = self.settings.get(
            urls_field_key, self.DEFAULT_FILES_URLS_FIELD
        )

        result_field_key = setting_key("IMAGES_RESULT_FIELD")
        self.files_result_field = self.settings.get(
            result_field_key, self.DEFAULT_FILES_RESULT_FIELD
        )

        min_width_key = setting_key("IMAGES_MIN_WIDTH")
        self.min_width = self.settings.getint(min_width_key, self.MIN_WIDTH)

        min_height_key = setting_key("IMAGES_MIN_HEIGHT")
        self.min_height = self.settings.getint(min_height_key, self.MIN_HEIGHT)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path derived from the request URL."""
        return repub.utils.local_image_path(request.url)

    def file_downloaded(self, response, request, info, *, item=None):
        """Convert the response body to JPEG, persist it and return its checksum.

        Raises:
            ImageException: If the image is narrower than ``min_width`` or
                shorter than ``min_height``.
        """
        path = self.file_path(request, response=response, info=info, item=item)

        declared_type = image_mimetype(response, url=request.url)
        buf, width, height = convert_image_body_to_jpeg(
            response.body, source_mimetype=declared_type
        )

        too_small = width < self.min_width or height < self.min_height
        if too_small:
            raise ImageException(
                f"Image too small ({width}x{height} < {self.min_width}x{self.min_height})"
            )

        checksum = buffer_checksum(buf)
        dimensions = {"width": width, "height": height}
        self.store.persist_file(
            path,
            buf,
            info,
            meta=dimensions,
            headers={"Content-Type": "image/jpeg"},
        )
        return checksum
class FilePipeline(BaseFilesPipeline): class FilePipeline(BaseFilesPipeline):

View file

@ -4,6 +4,7 @@ from types import SimpleNamespace
from typing import Any, cast from typing import Any, cast
import pytest import pytest
import pyvips
from scrapy.crawler import Crawler from scrapy.crawler import Crawler
from scrapy.http import Request, Response from scrapy.http import Request, Response
@ -16,10 +17,18 @@ from repub.config import (
build_feed_settings, build_feed_settings,
) )
from repub.items import ElementItem from repub.items import ElementItem
from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline from repub.pipelines import (
AudioPipeline,
FilePipeline,
ImagePipeline,
VideoPipeline,
convert_image_body_to_jpeg,
image_mimetype,
)
from repub.utils import ( from repub.utils import (
FileType, FileType,
local_audio_path, local_audio_path,
local_image_path,
local_video_path, local_video_path,
published_media_path, published_media_path,
) )
@ -53,6 +62,14 @@ def store_dir(pipeline: Any) -> Path:
return Path(cast(Any, pipeline.store).basedir) return Path(cast(Any, pipeline.store).basedir)
def transparent_png_bytes() -> bytes:
    """Return PNG bytes for a 2x3 four-band image created with ``Image.black``."""
    blank = pyvips.Image.black(2, 3, bands=4)
    return cast(Any, blank).pngsave_buffer()
def jpeg_bytes() -> bytes:
    """Return JPEG bytes (Q=90) for a 4x5 three-band image created with ``Image.black``."""
    blank = pyvips.Image.black(4, 5, bands=3)
    return cast(Any, blank).jpegsave_buffer(Q=90)
@pytest.mark.parametrize( @pytest.mark.parametrize(
("pipeline_cls", "store_setting"), ("pipeline_cls", "store_setting"),
[ [
@ -630,6 +647,99 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant
assert completed_item.audios == [result] assert completed_item.audios == [result]
def test_convert_image_body_to_jpeg_flattens_alpha_png() -> None:
    """A four-band PNG comes back as a three-band JPEG with a light first pixel."""
    converted, width, height = convert_image_body_to_jpeg(transparent_png_bytes())
    jpeg_payload = converted.getvalue()

    assert (width, height) == (2, 3)
    # JPEG data starts with the FF D8 FF marker bytes.
    assert jpeg_payload.startswith(b"\xff\xd8\xff")

    decoded = cast(Any, pyvips.Image.new_from_buffer(jpeg_payload, ""))
    assert decoded.width == 2
    assert decoded.height == 3
    assert decoded.bands == 3

    darkest_channel = min(decoded.getpoint(0, 0))
    assert darkest_channel >= 240
def test_convert_image_body_to_jpeg_passthroughs_jpeg_bytes() -> None:
    """JPEG input is returned byte-for-byte along with its dimensions."""
    original = jpeg_bytes()

    result = convert_image_body_to_jpeg(original)
    buffer, dimensions = result[0], result[1:]

    assert dimensions == (4, 5)
    assert buffer.getvalue() == original
def test_image_mimetype_does_not_guess_from_url_extension() -> None:
    """With no response, a ``.jpg`` URL alone yields no media type."""
    guessed = image_mimetype(url="https://example.com/photo.jpg")
    assert guessed is None
def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images(
    monkeypatch, tmp_path: Path
) -> None:
    """A PNG download is persisted once as a JPEG and recorded on the item."""
    crawler = build_test_crawler(tmp_path)
    pipeline = ImagePipeline.from_crawler(cast(Crawler, crawler))
    monkeypatch.setattr(pipeline, "inc_stats", lambda status: None)

    # Each entry: (path, stored bytes, meta, Content-Type header).
    persisted: list[tuple[str, bytes, dict[str, Any] | None, str | None]] = []
    source_url = "https://example.com/photo.png"
    item = ElementItem(
        feed_name="nasa",
        el=None,
        image_urls=[source_url],
        images=[],
        file_urls=[],
        files=[],
        audio_urls=[],
        audios=[],
        video_urls=[],
        videos=[],
    )

    def fake_persist_file(path, buf, info, meta=None, headers=None):
        del info
        content_type = None if headers is None else headers.get("Content-Type")
        record = (
            path,
            buf.getvalue(),
            cast(dict[str, Any] | None, meta),
            content_type,
        )
        persisted.append(record)

    monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file)

    png_response = Response(
        url=source_url,
        body=transparent_png_bytes(),
        status=200,
        headers={"Content-Type": "image/png"},
    )
    result = pipeline.media_downloaded(
        png_response,
        Request(source_url),
        spider_info(),
        item=item,
    )

    expected_result = {
        "url": source_url,
        "path": local_image_path(source_url),
        "checksum": result["checksum"],
        "status": "downloaded",
    }
    assert result == expected_result
    assert isinstance(result["checksum"], str)

    assert len(persisted) == 1
    stored_path, stored_bytes, stored_meta, stored_content_type = persisted[0]
    assert stored_path == local_image_path(source_url)
    assert stored_meta == {"width": 2, "height": 3}
    assert stored_content_type == "image/jpeg"

    image = cast(Any, pyvips.Image.new_from_buffer(stored_bytes, ""))
    assert image.width == 2
    assert image.height == 3
    assert image.bands == 3

    completed_item = pipeline.item_completed([(True, result)], item, spider_info())
    assert completed_item.images == [result]
def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants( def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(
monkeypatch, tmp_path: Path monkeypatch, tmp_path: Path
) -> None: ) -> None:

32
uv.lock generated
View file

@ -812,25 +812,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/1a/41/19c65578ef9a54b3083253c68a607f099642747168fe00f3a2bceb7c3a34/peewee-3.19.0-py3-none-any.whl", hash = "sha256:de220b94766e6008c466e00ce4ba5299b9a832117d9eb36d45d0062f3cfd7417", size = 411885, upload-time = "2026-01-07T17:24:58.33Z" }, { url = "https://files.pythonhosted.org/packages/1a/41/19c65578ef9a54b3083253c68a607f099642747168fe00f3a2bceb7c3a34/peewee-3.19.0-py3-none-any.whl", hash = "sha256:de220b94766e6008c466e00ce4ba5299b9a832117d9eb36d45d0062f3cfd7417", size = 411885, upload-time = "2026-01-07T17:24:58.33Z" },
] ]
[[package]]
name = "pillow"
version = "10.4.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/cd/74/ad3d526f3bf7b6d3f408b73fde271ec69dfac8b81341a318ce825f2b3812/pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06", size = 46555059, upload-time = "2024-07-01T09:48:43.583Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c3/00/706cebe7c2c12a6318aabe5d354836f54adff7156fd9e1bd6c89f4ba0e98/pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3", size = 3525685, upload-time = "2024-07-01T09:46:45.194Z" },
{ url = "https://files.pythonhosted.org/packages/cf/76/f658cbfa49405e5ecbfb9ba42d07074ad9792031267e782d409fd8fe7c69/pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb", size = 3374883, upload-time = "2024-07-01T09:46:47.331Z" },
{ url = "https://files.pythonhosted.org/packages/46/2b/99c28c4379a85e65378211971c0b430d9c7234b1ec4d59b2668f6299e011/pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70", size = 4339837, upload-time = "2024-07-01T09:46:49.647Z" },
{ url = "https://files.pythonhosted.org/packages/f1/74/b1ec314f624c0c43711fdf0d8076f82d9d802afd58f1d62c2a86878e8615/pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be", size = 4455562, upload-time = "2024-07-01T09:46:51.811Z" },
{ url = "https://files.pythonhosted.org/packages/4a/2a/4b04157cb7b9c74372fa867096a1607e6fedad93a44deeff553ccd307868/pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0", size = 4366761, upload-time = "2024-07-01T09:46:53.961Z" },
{ url = "https://files.pythonhosted.org/packages/ac/7b/8f1d815c1a6a268fe90481232c98dd0e5fa8c75e341a75f060037bd5ceae/pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc", size = 4536767, upload-time = "2024-07-01T09:46:56.664Z" },
{ url = "https://files.pythonhosted.org/packages/e5/77/05fa64d1f45d12c22c314e7b97398ffb28ef2813a485465017b7978b3ce7/pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a", size = 4477989, upload-time = "2024-07-01T09:46:58.977Z" },
{ url = "https://files.pythonhosted.org/packages/12/63/b0397cfc2caae05c3fb2f4ed1b4fc4fc878f0243510a7a6034ca59726494/pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309", size = 4610255, upload-time = "2024-07-01T09:47:01.189Z" },
{ url = "https://files.pythonhosted.org/packages/7b/f9/cfaa5082ca9bc4a6de66ffe1c12c2d90bf09c309a5f52b27759a596900e7/pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060", size = 2235603, upload-time = "2024-07-01T09:47:03.918Z" },
{ url = "https://files.pythonhosted.org/packages/01/6a/30ff0eef6e0c0e71e55ded56a38d4859bf9d3634a94a88743897b5f96936/pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea", size = 2554972, upload-time = "2024-07-01T09:47:06.152Z" },
{ url = "https://files.pythonhosted.org/packages/48/2c/2e0a52890f269435eee38b21c8218e102c621fe8d8df8b9dd06fabf879ba/pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d", size = 2243375, upload-time = "2024-07-01T09:47:09.065Z" },
]
[[package]] [[package]]
name = "platformdirs" name = "platformdirs"
version = "4.9.4" version = "4.9.4"
@ -1012,6 +993,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
] ]
[[package]]
name = "pyvips"
version = "3.1.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cffi" },
]
sdist = { url = "https://files.pythonhosted.org/packages/2d/6a/282936de9faac6addf6bc8792c18e006489d0023ffd8856b8643f54d0558/pyvips-3.1.1.tar.gz", hash = "sha256:84fe744d023b1084ac2516bb17064cacd41c7f8aabf8e524dd383534941b9301", size = 56951, upload-time = "2025-12-09T18:38:06.355Z" }
[[package]] [[package]]
name = "pyyaml" name = "pyyaml"
version = "6.0.3" version = "6.0.3"
@ -1093,10 +1083,10 @@ dependencies = [
{ name = "hypercorn" }, { name = "hypercorn" },
{ name = "lxml" }, { name = "lxml" },
{ name = "peewee" }, { name = "peewee" },
{ name = "pillow" },
{ name = "prometheus-client" }, { name = "prometheus-client" },
{ name = "pygea" }, { name = "pygea" },
{ name = "python-dateutil" }, { name = "python-dateutil" },
{ name = "pyvips" },
{ name = "quart" }, { name = "quart" },
{ name = "scrapy" }, { name = "scrapy" },
] ]
@ -1126,10 +1116,10 @@ requires-dist = [
{ name = "hypercorn", specifier = ">=0.18.0,<0.19.0" }, { name = "hypercorn", specifier = ">=0.18.0,<0.19.0" },
{ name = "lxml", specifier = ">=5.2.1,<6.0.0" }, { name = "lxml", specifier = ">=5.2.1,<6.0.0" },
{ name = "peewee", specifier = ">=3.19.0,<4.0.0" }, { name = "peewee", specifier = ">=3.19.0,<4.0.0" },
{ name = "pillow", specifier = ">=10.3.0,<11.0.0" },
{ name = "prometheus-client", specifier = ">=0.20.0,<0.21.0" }, { name = "prometheus-client", specifier = ">=0.20.0,<0.21.0" },
{ name = "pygea", git = "https://guardianproject.dev/anynews/pygea.git" }, { name = "pygea", git = "https://guardianproject.dev/anynews/pygea.git" },
{ name = "python-dateutil", specifier = ">=2.9.0.post0,<3.0.0" }, { name = "python-dateutil", specifier = ">=2.9.0.post0,<3.0.0" },
{ name = "pyvips", specifier = ">=3.0.0,<4.0.0" },
{ name = "quart", specifier = ">=0.20.0,<0.21.0" }, { name = "quart", specifier = ">=0.20.0,<0.21.0" },
{ name = "scrapy", specifier = ">=2.11.1,<3.0.0" }, { name = "scrapy", specifier = ">=2.11.1,<3.0.0" },
] ]