diff --git a/flake.nix b/flake.nix index 2d4cda9..66ce2f4 100644 --- a/flake.nix +++ b/flake.nix @@ -63,6 +63,12 @@ feedgen = prev.feedgen.overrideAttrs (old: { nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ final.setuptools ]; }); + pyvips = prev.pyvips.overrideAttrs (old: { + nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ + final.setuptools + final.pkgconfig + ]; + }); pygea = prev.pygea.overrideAttrs (old: { nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ final.hatchling @@ -108,6 +114,7 @@ checkPhase = '' runHook preCheck export HOME="$(mktemp -d)" + export LD_LIBRARY_PATH="${pkgs.lib.makeLibraryPath [ pkgs.vips ]}:$LD_LIBRARY_PATH" pytest tests/ -v runHook postCheck ''; @@ -125,7 +132,8 @@ postBuild = '' rm -f "$out/bin/repub" makeWrapper "${baseVenv}/bin/repub" "$out/bin/repub" \ - --prefix PATH : "${pkgs.lib.makeBinPath [ ffmpegPackage ]}" + --prefix PATH : "${pkgs.lib.makeBinPath [ ffmpegPackage ]}" \ + --prefix LD_LIBRARY_PATH : "${pkgs.lib.makeLibraryPath [ pkgs.vips ]}" ''; meta.mainProgram = "repub"; }; @@ -273,12 +281,14 @@ packages = [ pkgs.tailwindcss_4 pkgs.python313 + pkgs.vips pkgs.uv pkgs.pyright (mkFfmpegPackage pkgs) ]; env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [ pkgs.stdenv.cc.cc + pkgs.vips ]; env.UV_PROJECT_ENVIRONMENT = ".venv"; env.UV_PYTHON_DOWNLOADS = "never"; diff --git a/pyproject.toml b/pyproject.toml index b87027b..baddc3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ dependencies = [ "colorlog>=6.8.2,<7.0.0", "feedparser>=6.0.11,<7.0.0", "lxml>=5.2.1,<6.0.0", - "pillow>=10.3.0,<11.0.0", + "pyvips>=3.0.0,<4.0.0", "ffmpeg-python>=0.2.0,<0.3.0", "Quart>=0.20.0,<0.21.0", "hypercorn>=0.18.0,<0.19.0", diff --git a/repub/pipelines.py b/repub/pipelines.py index c2b11e3..a32f527 100644 --- a/repub/pipelines.py +++ b/repub/pipelines.py @@ -1,3 +1,4 @@ +import functools import hashlib import logging import mimetypes @@ -8,10 +9,10 @@ from os import PathLike from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast +import pyvips from scrapy.crawler import Crawler from scrapy.pipelines.files import FileException from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline -from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline import repub.utils from repub import media @@ -20,12 +21,108 @@ from repub.items import MediaVariant, TranscodedMediaFile logger = logging.getLogger(__name__) -class ImagePipeline(BaseImagesPipeline): +class ImageException(FileException): + """General image error exception""" + + +def image_mimetype(response=None, *, url: str | None = None) -> str | None: + del url + if response is not None: + content_type = response.headers.get(b"Content-Type") + if content_type: + return content_type.decode("utf-8").split(";", 1)[0].strip() + return None + + +def convert_image_body_to_jpeg( + body: bytes, + *, + source_mimetype: str | None = None, +) -> tuple[BytesIO, int, int]: + try: + image = cast( + Any, + pyvips.Image.new_from_buffer(body, "", access="sequential"), + ).autorot() + except pyvips.Error as exc: + raise ImageException(str(exc)) from exc + + width = image.width + height = image.height + loader = "" + if image.get_typeof("vips-loader"): + loader = str(image.get("vips-loader")) + if source_mimetype == "image/jpeg" or loader.startswith("jpegload"): + return BytesIO(body), width, height + + if image.hasalpha(): + image = image.flatten(background=[255, 255, 255]) + image = image.colourspace("srgb") + return BytesIO(image.jpegsave_buffer()), width, height + + +class ImagePipeline(BaseFilesPipeline): + MEDIA_NAME = "image" + EXPIRES = 90 + MIN_WIDTH = 0 + MIN_HEIGHT = 0 + DEFAULT_FILES_URLS_FIELD = "image_urls" + DEFAULT_FILES_RESULT_FIELD = "images" + + @classmethod + def from_crawler(cls, crawler: Crawler): + cls._update_stores(crawler.settings) + return cls(crawler.settings["IMAGES_STORE"], crawler=crawler) + + def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler): + self.settings = crawler.settings + super().__init__(store_uri, crawler=crawler) + resolve = functools.partial( + self._key_for_pipe, + base_class_name="ImagesPipeline", + settings=self.settings, + ) + self.expires = self.settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES) + self.files_urls_field = self.settings.get( + resolve("IMAGES_URLS_FIELD"), + self.DEFAULT_FILES_URLS_FIELD, + ) + self.files_result_field = self.settings.get( + resolve("IMAGES_RESULT_FIELD"), + self.DEFAULT_FILES_RESULT_FIELD, + ) + self.min_width = self.settings.getint( + resolve("IMAGES_MIN_WIDTH"), + self.MIN_WIDTH, + ) + self.min_height = self.settings.getint( + resolve("IMAGES_MIN_HEIGHT"), + self.MIN_HEIGHT, + ) + def file_path(self, request, response=None, info=None, *, item=None): return repub.utils.local_image_path(request.url) - def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None): - raise NotImplementedError() + def file_downloaded(self, response, request, info, *, item=None): + path = self.file_path(request, response=response, info=info, item=item) + buf, width, height = convert_image_body_to_jpeg( + response.body, + source_mimetype=image_mimetype(response, url=request.url), + ) + if width < self.min_width or height < self.min_height: + raise ImageException( + "Image too small " + f"({width}x{height} < {self.min_width}x{self.min_height})" + ) + checksum = buffer_checksum(buf) + self.store.persist_file( + path, + buf, + info, + meta={"width": width, "height": height}, + headers={"Content-Type": "image/jpeg"}, + ) + return checksum class FilePipeline(BaseFilesPipeline): diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 523f9bd..0c1ec6b 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -4,6 +4,7 @@ from types import SimpleNamespace from typing import Any, cast import pytest +import pyvips from scrapy.crawler import Crawler from scrapy.http import Request, Response @@ -16,10 +17,18 @@ from repub.config import ( build_feed_settings, ) from repub.items import ElementItem -from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline +from repub.pipelines import ( + AudioPipeline, + FilePipeline, + ImagePipeline, + VideoPipeline, + convert_image_body_to_jpeg, + image_mimetype, +) from repub.utils import ( FileType, local_audio_path, + local_image_path, local_video_path, published_media_path, ) @@ -53,6 +62,14 @@ def store_dir(pipeline: Any) -> Path: return Path(cast(Any, pipeline.store).basedir) +def transparent_png_bytes() -> bytes: + return cast(Any, pyvips.Image.black(2, 3, bands=4)).pngsave_buffer() + + +def jpeg_bytes() -> bytes: + return cast(Any, pyvips.Image.black(4, 5, bands=3)).jpegsave_buffer(Q=90) + + @pytest.mark.parametrize( ("pipeline_cls", "store_setting"), [ @@ -630,6 +647,99 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant assert completed_item.audios == [result] +def test_convert_image_body_to_jpeg_flattens_alpha_png() -> None: + converted, width, height = convert_image_body_to_jpeg(transparent_png_bytes()) + + assert (width, height) == (2, 3) + assert converted.getvalue().startswith(b"\xff\xd8\xff") + + image = cast(Any, pyvips.Image.new_from_buffer(converted.getvalue(), "")) + assert image.width == 2 + assert image.height == 3 + assert image.bands == 3 + assert min(image.getpoint(0, 0)) >= 240 + + +def test_convert_image_body_to_jpeg_passthroughs_jpeg_bytes() -> None: + source = jpeg_bytes() + + converted, width, height = convert_image_body_to_jpeg(source) + + assert (width, height) == (4, 5) + assert converted.getvalue() == source + + +def test_image_mimetype_does_not_guess_from_url_extension() -> None: + assert image_mimetype(url="https://example.com/photo.jpg") is None + + +def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images( + monkeypatch, tmp_path: Path +) -> None: + crawler = build_test_crawler(tmp_path) + pipeline = ImagePipeline.from_crawler(cast(Crawler, crawler)) + monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) + persisted: list[tuple[str, bytes, dict[str, Any] | None, str | None]] = [] + source_url = "https://example.com/photo.png" + item = ElementItem( + feed_name="nasa", + el=None, + image_urls=[source_url], + images=[], + file_urls=[], + files=[], + audio_urls=[], + audios=[], + video_urls=[], + videos=[], + ) + + def fake_persist_file(path, buf, info, meta=None, headers=None): + del info + persisted.append( + ( + path, + buf.getvalue(), + cast(dict[str, Any] | None, meta), + None if headers is None else headers.get("Content-Type"), + ) + ) + + monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file) + + result = pipeline.media_downloaded( + Response( + url=source_url, + body=transparent_png_bytes(), + status=200, + headers={"Content-Type": "image/png"}, + ), + Request(source_url), + spider_info(), + item=item, + ) + + assert result == { + "url": source_url, + "path": local_image_path(source_url), + "checksum": result["checksum"], + "status": "downloaded", + } + assert isinstance(result["checksum"], str) + assert len(persisted) == 1 + assert persisted[0][0] == local_image_path(source_url) + assert persisted[0][2] == {"width": 2, "height": 3} + assert persisted[0][3] == "image/jpeg" + + image = cast(Any, pyvips.Image.new_from_buffer(persisted[0][1], "")) + assert image.width == 2 + assert image.height == 3 + assert image.bands == 3 + + completed_item = pipeline.item_completed([(True, result)], item, spider_info()) + assert completed_item.images == [result] + + def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants( monkeypatch, tmp_path: Path ) -> None: diff --git a/uv.lock b/uv.lock index 857e52d..3a73346 100644 --- a/uv.lock +++ b/uv.lock @@ -812,25 +812,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/41/19c65578ef9a54b3083253c68a607f099642747168fe00f3a2bceb7c3a34/peewee-3.19.0-py3-none-any.whl", hash = "sha256:de220b94766e6008c466e00ce4ba5299b9a832117d9eb36d45d0062f3cfd7417", size = 411885, upload-time = "2026-01-07T17:24:58.33Z" }, ] -[[package]] -name = "pillow" -version = "10.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cd/74/ad3d526f3bf7b6d3f408b73fde271ec69dfac8b81341a318ce825f2b3812/pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06", size = 46555059, upload-time = "2024-07-01T09:48:43.583Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c3/00/706cebe7c2c12a6318aabe5d354836f54adff7156fd9e1bd6c89f4ba0e98/pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3", size = 3525685, upload-time = "2024-07-01T09:46:45.194Z" }, - { url = "https://files.pythonhosted.org/packages/cf/76/f658cbfa49405e5ecbfb9ba42d07074ad9792031267e782d409fd8fe7c69/pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb", size = 3374883, upload-time = "2024-07-01T09:46:47.331Z" }, - { url = "https://files.pythonhosted.org/packages/46/2b/99c28c4379a85e65378211971c0b430d9c7234b1ec4d59b2668f6299e011/pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70", size = 4339837, upload-time = "2024-07-01T09:46:49.647Z" }, - { url = "https://files.pythonhosted.org/packages/f1/74/b1ec314f624c0c43711fdf0d8076f82d9d802afd58f1d62c2a86878e8615/pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be", size = 4455562, upload-time = "2024-07-01T09:46:51.811Z" }, - { url = "https://files.pythonhosted.org/packages/4a/2a/4b04157cb7b9c74372fa867096a1607e6fedad93a44deeff553ccd307868/pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0", size = 4366761, upload-time = "2024-07-01T09:46:53.961Z" }, - { url = "https://files.pythonhosted.org/packages/ac/7b/8f1d815c1a6a268fe90481232c98dd0e5fa8c75e341a75f060037bd5ceae/pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc", size = 4536767, upload-time = "2024-07-01T09:46:56.664Z" }, - { url = "https://files.pythonhosted.org/packages/e5/77/05fa64d1f45d12c22c314e7b97398ffb28ef2813a485465017b7978b3ce7/pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a", size = 4477989, upload-time = "2024-07-01T09:46:58.977Z" }, - { url = "https://files.pythonhosted.org/packages/12/63/b0397cfc2caae05c3fb2f4ed1b4fc4fc878f0243510a7a6034ca59726494/pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309", size = 4610255, upload-time = "2024-07-01T09:47:01.189Z" }, - { url = "https://files.pythonhosted.org/packages/7b/f9/cfaa5082ca9bc4a6de66ffe1c12c2d90bf09c309a5f52b27759a596900e7/pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060", size = 2235603, upload-time = "2024-07-01T09:47:03.918Z" }, - { url = "https://files.pythonhosted.org/packages/01/6a/30ff0eef6e0c0e71e55ded56a38d4859bf9d3634a94a88743897b5f96936/pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea", size = 2554972, upload-time = "2024-07-01T09:47:06.152Z" }, - { url = "https://files.pythonhosted.org/packages/48/2c/2e0a52890f269435eee38b21c8218e102c621fe8d8df8b9dd06fabf879ba/pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d", size = 2243375, upload-time = "2024-07-01T09:47:09.065Z" }, -] - [[package]] name = "platformdirs" version = "4.9.4" @@ -1012,6 +993,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] +[[package]] +name = "pyvips" +version = "3.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2d/6a/282936de9faac6addf6bc8792c18e006489d0023ffd8856b8643f54d0558/pyvips-3.1.1.tar.gz", hash = "sha256:84fe744d023b1084ac2516bb17064cacd41c7f8aabf8e524dd383534941b9301", size = 56951, upload-time = "2025-12-09T18:38:06.355Z" } + [[package]] name = "pyyaml" version = "6.0.3" @@ -1093,10 +1083,10 @@ dependencies = [ { name = "hypercorn" }, { name = "lxml" }, { name = "peewee" }, - { name = "pillow" }, { name = "prometheus-client" }, { name = "pygea" }, { name = "python-dateutil" }, + { name = "pyvips" }, { name = "quart" }, { name = "scrapy" }, ] @@ -1126,10 +1116,10 @@ requires-dist = [ { name = "hypercorn", specifier = ">=0.18.0,<0.19.0" }, { name = "lxml", specifier = ">=5.2.1,<6.0.0" }, { name = "peewee", specifier = ">=3.19.0,<4.0.0" }, - { name = "pillow", specifier = ">=10.3.0,<11.0.0" }, { name = "prometheus-client", specifier = ">=0.20.0,<0.21.0" }, { name = "pygea", git = "https://guardianproject.dev/anynews/pygea.git" }, { name = "python-dateutil", specifier = ">=2.9.0.post0,<3.0.0" }, + { name = "pyvips", specifier = ">=3.0.0,<4.0.0" }, { name = "quart", specifier = ">=0.20.0,<0.21.0" }, { name = "scrapy", specifier = ">=2.11.1,<3.0.0" }, ]