Replace Scrapy image pipeline with pyvips
This commit is contained in:
parent
180677efa7
commit
7316d4723f
5 changed files with 235 additions and 28 deletions
|
|
@ -1,3 +1,4 @@
|
|||
import functools
|
||||
import hashlib
|
||||
import logging
|
||||
import mimetypes
|
||||
|
|
@ -8,10 +9,10 @@ from os import PathLike
|
|||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union, cast
|
||||
|
||||
import pyvips
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.pipelines.files import FileException
|
||||
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
|
||||
from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline
|
||||
|
||||
import repub.utils
|
||||
from repub import media
|
||||
|
|
@ -20,12 +21,108 @@ from repub.items import MediaVariant, TranscodedMediaFile
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ImagePipeline(BaseImagesPipeline):
|
||||
class ImageException(FileException):
    """Error raised when an image cannot be processed by the image pipeline."""
|
||||
|
||||
|
||||
def image_mimetype(response=None, *, url: str | None = None) -> str | None:
|
||||
del url
|
||||
if response is not None:
|
||||
content_type = response.headers.get(b"Content-Type")
|
||||
if content_type:
|
||||
return content_type.decode("utf-8").split(";", 1)[0].strip()
|
||||
return None
|
||||
|
||||
|
||||
def convert_image_body_to_jpeg(
    body: bytes,
    *,
    source_mimetype: str | None = None,
) -> tuple[BytesIO, int, int]:
    """Return an image body as JPEG bytes together with its dimensions.

    The body is opened with pyvips and auto-rotated. Bodies that already are
    JPEG are handed back untouched; anything else is flattened onto a white
    background (when it has an alpha channel), converted to sRGB and
    re-encoded as JPEG.

    Args:
        body: Raw bytes of the downloaded image.
        source_mimetype: Media type declared by the server. It is only used
            as a fallback when pyvips does not report which loader it used,
            so a mislabelled non-JPEG body is still converted.

    Returns:
        A ``(buffer, width, height)`` tuple. ``width`` and ``height`` are
        read from the auto-rotated image.

    Raises:
        ImageException: If pyvips fails to open, process or encode the image.
    """
    # libvips evaluates lazily: a corrupt body may only fail once pixels are
    # actually pulled during flatten/colourspace/jpegsave. Keep the whole
    # pipeline inside the try so every pyvips failure becomes ImageException.
    try:
        image = cast(
            Any,
            pyvips.Image.new_from_buffer(body, "", access="sequential"),
        ).autorot()
        width = image.width
        height = image.height

        loader = ""
        if image.get_typeof("vips-loader"):
            loader = str(image.get("vips-loader"))

        # Trust the loader that actually decoded the bytes over the header the
        # server sent; fall back to the header only when the loader is unknown.
        if loader:
            already_jpeg = loader.startswith("jpegload")
        else:
            already_jpeg = source_mimetype == "image/jpeg"
        if already_jpeg:
            return BytesIO(body), width, height

        if image.hasalpha():
            # JPEG has no alpha channel; composite onto white first.
            image = image.flatten(background=[255, 255, 255])
        image = image.colourspace("srgb")
        encoded = image.jpegsave_buffer()
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc

    return BytesIO(encoded), width, height
|
||||
|
||||
|
||||
class ImagePipeline(BaseFilesPipeline):
    """Files pipeline that converts downloaded images to JPEG before storing.

    Configuration is read from the ``IMAGES_*`` settings, and results are
    written to the item's ``images`` field by default.
    """

    MEDIA_NAME = "image"
    EXPIRES = 90
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    DEFAULT_FILES_URLS_FIELD = "image_urls"
    DEFAULT_FILES_RESULT_FIELD = "images"

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Build the pipeline from ``crawler``, storing under ``IMAGES_STORE``."""
        crawler_settings = crawler.settings
        cls._update_stores(crawler_settings)
        return cls(crawler_settings["IMAGES_STORE"], crawler=crawler)

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        """Initialise the base pipeline, then apply the ``IMAGES_*`` settings."""
        self.settings = crawler.settings
        super().__init__(store_uri, crawler=crawler)

        settings = self.settings

        def setting_key(name):
            # Resolve the possibly class-prefixed settings key for this pipe.
            return self._key_for_pipe(
                name,
                base_class_name="ImagesPipeline",
                settings=settings,
            )

        self.expires = settings.getint(setting_key("IMAGES_EXPIRES"), self.EXPIRES)
        self.files_urls_field = settings.get(
            setting_key("IMAGES_URLS_FIELD"),
            self.DEFAULT_FILES_URLS_FIELD,
        )
        self.files_result_field = settings.get(
            setting_key("IMAGES_RESULT_FIELD"),
            self.DEFAULT_FILES_RESULT_FIELD,
        )
        self.min_width = settings.getint(
            setting_key("IMAGES_MIN_WIDTH"),
            self.MIN_WIDTH,
        )
        self.min_height = settings.getint(
            setting_key("IMAGES_MIN_HEIGHT"),
            self.MIN_HEIGHT,
        )

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image, derived from the request URL."""
        return repub.utils.local_image_path(request.url)

    def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
        """Not implemented by this pipeline; always raises."""
        raise NotImplementedError()

    def file_downloaded(self, response, request, info, *, item=None):
        """Convert the response body to JPEG, persist it, return its checksum.

        Raises:
            ImageException: If the image is smaller than the configured
                minimum width or height.
        """
        target_path = self.file_path(request, response=response, info=info, item=item)
        raw_body = response.body
        declared_type = image_mimetype(response, url=request.url)
        jpeg_buf, img_width, img_height = convert_image_body_to_jpeg(
            raw_body,
            source_mimetype=declared_type,
        )

        big_enough = img_width >= self.min_width and img_height >= self.min_height
        if not big_enough:
            raise ImageException(
                "Image too small "
                f"({img_width}x{img_height} < {self.min_width}x{self.min_height})"
            )

        # Checksum is taken before the buffer is handed to the store.
        digest = buffer_checksum(jpeg_buf)
        dimensions = {"width": img_width, "height": img_height}
        self.store.persist_file(
            target_path,
            jpeg_buf,
            info,
            meta=dimensions,
            headers={"Content-Type": "image/jpeg"},
        )
        return digest
|
||||
|
||||
|
||||
class FilePipeline(BaseFilesPipeline):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue