Replace Scrapy image pipeline with pyvips

This commit is contained in:
Abel Luck 2026-04-08 16:39:39 +02:00
parent 180677efa7
commit 7316d4723f
5 changed files with 235 additions and 28 deletions

View file

@ -1,3 +1,4 @@
import functools
import hashlib
import logging
import mimetypes
@ -8,10 +9,10 @@ from os import PathLike
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, cast
import pyvips
from scrapy.crawler import Crawler
from scrapy.pipelines.files import FileException
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
import repub.utils
from repub import media
@ -20,12 +21,108 @@ from repub.items import MediaVariant, TranscodedMediaFile
logger = logging.getLogger(__name__)
class ImagePipeline(BaseImagesPipeline):
class ImageException(FileException):
"""General image error exception"""
def image_mimetype(response=None, *, url: str | None = None) -> str | None:
del url
if response is not None:
content_type = response.headers.get(b"Content-Type")
if content_type:
return content_type.decode("utf-8").split(";", 1)[0].strip()
return None
def convert_image_body_to_jpeg(
body: bytes,
*,
source_mimetype: str | None = None,
) -> tuple[BytesIO, int, int]:
try:
image = cast(
Any,
pyvips.Image.new_from_buffer(body, "", access="sequential"),
).autorot()
except pyvips.Error as exc:
raise ImageException(str(exc)) from exc
width = image.width
height = image.height
loader = ""
if image.get_typeof("vips-loader"):
loader = str(image.get("vips-loader"))
if source_mimetype == "image/jpeg" or loader.startswith("jpegload"):
return BytesIO(body), width, height
if image.hasalpha():
image = image.flatten(background=[255, 255, 255])
image = image.colourspace("srgb")
return BytesIO(image.jpegsave_buffer()), width, height
class ImagePipeline(BaseFilesPipeline):
MEDIA_NAME = "image"
EXPIRES = 90
MIN_WIDTH = 0
MIN_HEIGHT = 0
DEFAULT_FILES_URLS_FIELD = "image_urls"
DEFAULT_FILES_RESULT_FIELD = "images"
@classmethod
def from_crawler(cls, crawler: Crawler):
cls._update_stores(crawler.settings)
return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
resolve = functools.partial(
self._key_for_pipe,
base_class_name="ImagesPipeline",
settings=self.settings,
)
self.expires = self.settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES)
self.files_urls_field = self.settings.get(
resolve("IMAGES_URLS_FIELD"),
self.DEFAULT_FILES_URLS_FIELD,
)
self.files_result_field = self.settings.get(
resolve("IMAGES_RESULT_FIELD"),
self.DEFAULT_FILES_RESULT_FIELD,
)
self.min_width = self.settings.getint(
resolve("IMAGES_MIN_WIDTH"),
self.MIN_WIDTH,
)
self.min_height = self.settings.getint(
resolve("IMAGES_MIN_HEIGHT"),
self.MIN_HEIGHT,
)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_image_path(request.url)
def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
raise NotImplementedError()
def file_downloaded(self, response, request, info, *, item=None):
path = self.file_path(request, response=response, info=info, item=item)
buf, width, height = convert_image_body_to_jpeg(
response.body,
source_mimetype=image_mimetype(response, url=request.url),
)
if width < self.min_width or height < self.min_height:
raise ImageException(
"Image too small "
f"({width}x{height} < {self.min_width}x{self.min_height})"
)
checksum = buffer_checksum(buf)
self.store.persist_file(
path,
buf,
info,
meta={"width": width, "height": height},
headers={"Content-Type": "image/jpeg"},
)
return checksum
class FilePipeline(BaseFilesPipeline):