Replace Scrapy image pipeline with pyvips

2026-04-08 16:39:39 +02:00 · 2026-04-08 16:39:39 +02:00 · 7316d4723f
commit 7316d4723f
parent 180677efa7
5 changed files with 235 additions and 28 deletions
--- a/repub/pipelines.py
+++ b/repub/pipelines.py
@ -1,3 +1,4 @@
+import functools
 import hashlib
 import logging
 import mimetypes
@ -8,10 +9,10 @@ from os import PathLike
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union, cast

+import pyvips
 from scrapy.crawler import Crawler
 from scrapy.pipelines.files import FileException
 from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
-from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline

 import repub.utils
 from repub import media
@ -20,12 +21,108 @@ from repub.items import MediaVariant, TranscodedMediaFile
 logger = logging.getLogger(__name__)


-class ImagePipeline(BaseImagesPipeline):
+class ImageException(FileException):
+    """General image error exception.
+
+    Raised in this module when pyvips cannot load an image body and when a
+    downloaded image is smaller than the configured minimum dimensions.
+    """
+
+
+def image_mimetype(response=None, *, url: str | None = None) -> str | None:
+    """Return the media type from the response's ``Content-Type`` header.
+
+    Anything after the first ``;`` (e.g. ``charset=...``) is dropped and
+    surrounding whitespace is stripped. The case of the value is left as
+    sent by the server.
+
+    Args:
+        response: Object whose ``headers.get(b"Content-Type")`` yields the
+            raw header value, or ``None``.
+        url: Accepted for the caller's convenience but not used.
+
+    Returns:
+        The media type string, or ``None`` when no response is given or the
+        header is missing or empty.
+    """
+    # ``url`` is unused; only the response header is consulted.
+    del url
+    if response is not None:
+        content_type = response.headers.get(b"Content-Type")
+        if content_type:
+            # Keep only the part before any ``;`` parameters.
+            return content_type.decode("utf-8").split(";", 1)[0].strip()
+    return None
+
+
+def convert_image_body_to_jpeg(
+    body: bytes,
+    *,
+    source_mimetype: str | None = None,
+) -> tuple[BytesIO, int, int]:
+    """Return ``body`` as a JPEG buffer together with its width and height.
+
+    The image is loaded with pyvips and auto-rotated; the reported width and
+    height are those of the auto-rotated image. Bodies that are already JPEG
+    are returned unchanged (the original bytes, not a re-encode). Anything
+    else has its alpha channel flattened onto white, is converted to sRGB
+    and is encoded with ``jpegsave_buffer``.
+
+    Args:
+        body: Raw image bytes as downloaded.
+        source_mimetype: Media type reported for ``body``, if known.
+
+    Returns:
+        A ``(buffer, width, height)`` tuple.
+
+    Raises:
+        ImageException: If pyvips fails at any stage (load, transform or
+            JPEG encode).
+    """
+    # libvips evaluates lazily: ``new_from_buffer`` only parses the header,
+    # so corrupt or truncated pixel data typically fails later, while the
+    # pipeline is being evaluated by ``jpegsave_buffer``. Every pyvips call
+    # therefore sits inside the ``try`` so callers only ever see
+    # ``ImageException`` for undecodable images.
+    try:
+        image = cast(
+            Any,
+            pyvips.Image.new_from_buffer(body, "", access="sequential"),
+        ).autorot()
+
+        width = image.width
+        height = image.height
+        loader = ""
+        if image.get_typeof("vips-loader"):
+            loader = str(image.get("vips-loader"))
+        # NOTE(review): a body served as ``image/jpeg`` is passed through
+        # even when libvips picked a non-JPEG loader for it - confirm that
+        # trusting the header over the detected loader is intended.
+        if source_mimetype == "image/jpeg" or loader.startswith("jpegload"):
+            return BytesIO(body), width, height
+
+        if image.hasalpha():
+            # JPEG has no alpha channel; composite onto a white background.
+            image = image.flatten(background=[255, 255, 255])
+        image = image.colourspace("srgb")
+        jpeg_bytes = image.jpegsave_buffer()
+    except pyvips.Error as exc:
+        raise ImageException(str(exc)) from exc
+    return BytesIO(jpeg_bytes), width, height
+
+
+class ImagePipeline(BaseFilesPipeline):
+    """Files pipeline that converts downloaded images to JPEG with pyvips.
+
+    Configuration is read from the ``IMAGES_*`` settings (store, expiry,
+    URL/result field names and minimum dimensions).
+    """
+
+    MEDIA_NAME = "image"
+    # Fallback values used when the matching IMAGES_* setting is not set.
+    EXPIRES = 90
+    MIN_WIDTH = 0
+    MIN_HEIGHT = 0
+    DEFAULT_FILES_URLS_FIELD = "image_urls"
+    DEFAULT_FILES_RESULT_FIELD = "images"
+
+    @classmethod
+    def from_crawler(cls, crawler: Crawler):
+        """Build the pipeline from ``crawler``, storing under ``IMAGES_STORE``."""
+        cls._update_stores(crawler.settings)
+        return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)
+
+    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
+        """Initialise the base pipeline, then apply the ``IMAGES_*`` settings.
+
+        Args:
+            store_uri: Location passed to the base files pipeline as its store.
+            crawler: Crawler whose settings configure this pipeline.
+        """
+        self.settings = crawler.settings
+        super().__init__(store_uri, crawler=crawler)
+        # Maps a setting name through ``_key_for_pipe`` with the base class
+        # name "ImagesPipeline" - presumably so subclass-prefixed setting
+        # names are honoured as in Scrapy's own image pipeline; TODO confirm.
+        resolve = functools.partial(
+            self._key_for_pipe,
+            base_class_name="ImagesPipeline",
+            settings=self.settings,
+        )
+        self.expires = self.settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES)
+        self.files_urls_field = self.settings.get(
+            resolve("IMAGES_URLS_FIELD"),
+            self.DEFAULT_FILES_URLS_FIELD,
+        )
+        self.files_result_field = self.settings.get(
+            resolve("IMAGES_RESULT_FIELD"),
+            self.DEFAULT_FILES_RESULT_FIELD,
+        )
+        self.min_width = self.settings.getint(
+            resolve("IMAGES_MIN_WIDTH"),
+            self.MIN_WIDTH,
+        )
+        self.min_height = self.settings.getint(
+            resolve("IMAGES_MIN_HEIGHT"),
+            self.MIN_HEIGHT,
+        )
+
    def file_path(self, request, response=None, info=None, *, item=None):
        return repub.utils.local_image_path(request.url)

-    def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
-        raise NotImplementedError()
+    def file_downloaded(self, response, request, info, *, item=None):
+        """Convert the response body to JPEG, check its size and persist it.
+
+        Returns:
+            The value of ``buffer_checksum`` for the converted buffer.
+
+        Raises:
+            ImageException: If the image is narrower than ``min_width`` or
+                shorter than ``min_height``, or if raised by
+                ``convert_image_body_to_jpeg``.
+        """
+        path = self.file_path(request, response=response, info=info, item=item)
+        buf, width, height = convert_image_body_to_jpeg(
+            response.body,
+            source_mimetype=image_mimetype(response, url=request.url),
+        )
+        # The size check uses the dimensions reported by the converter.
+        if width < self.min_width or height < self.min_height:
+            raise ImageException(
+                "Image too small "
+                f"({width}x{height} < {self.min_width}x{self.min_height})"
+            )
+        checksum = buffer_checksum(buf)
+        # Always stored as JPEG, with the dimensions attached as metadata.
+        self.store.persist_file(
+            path,
+            buf,
+            info,
+            meta={"width": width, "height": height},
+            headers={"Content-Type": "image/jpeg"},
+        )
+        return checksum


 class FilePipeline(BaseFilesPipeline):