republisher/repub/pipelines.py
Abel Luck 525393272e Replace image pipeline with profile-driven variants
- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
2026-05-27 09:24:22 +02:00

825 lines
29 KiB
Python

import functools
import hashlib
import logging
import mimetypes
import tempfile
import time
from io import BytesIO
from os import PathLike
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, cast
import pyvips
from scrapy.crawler import Crawler
from scrapy.pipelines.files import FileException
from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
import repub.utils
from repub import media
from repub.items import (
MediaVariant,
ThumbnailVariant,
TranscodedImageFile,
TranscodedMediaFile,
)
logger = logging.getLogger(__name__)
class ImageException(FileException):
"""General image error exception"""
def image_mimetype(response=None, *, url: str | None = None) -> str | None:
del url
if response is not None:
content_type = response.headers.get(b"Content-Type")
if content_type:
return content_type.decode("utf-8").split(";", 1)[0].strip()
return None
def image_loader_name(image: Any) -> str:
if image.get_typeof("vips-loader"):
return str(image.get("vips-loader"))
return ""
def image_loader_mimetype(loader: str, fallback: str | None = None) -> str | None:
known = {
"jpegload": "image/jpeg",
"pngload": "image/png",
"gifload": "image/gif",
"svgload": "image/svg+xml",
"tiffload": "image/tiff",
"webpload": "image/webp",
"heifload": "image/heif",
"jxlload": "image/jxl",
}
for prefix, mimetype in known.items():
if loader.startswith(prefix):
return mimetype
return fallback
def load_image_from_buffer(body: bytes) -> Any:
try:
return cast(
Any,
pyvips.Image.new_from_buffer(body, "", access="sequential"),
)
except pyvips.Error as exc:
raise ImageException(str(exc)) from exc
def load_image_from_file(file_path: str | Path) -> Any:
try:
return cast(
Any,
pyvips.Image.new_from_file(str(file_path), access="sequential"),
)
except pyvips.Error as exc:
raise ImageException(str(exc)) from exc
def render_image_profile(source_path: str | Path, profile: dict[str, Any]) -> BytesIO:
transform = str(profile["transform"])
transform_kwargs = dict(profile.get("transform_kwargs", {}))
width = int(transform_kwargs.pop("width"))
if transform == "thumbnail":
image = cast(
Any,
pyvips.Image.thumbnail(str(source_path), width, **transform_kwargs),
)
elif transform == "thumbnail_buffer":
image = cast(
Any,
pyvips.Image.thumbnail_buffer(
Path(source_path).read_bytes(),
width,
**transform_kwargs,
),
)
else:
raise ImageException(f"Unsupported image transform: {transform}")
image = image.colourspace("srgb")
if image.hasalpha() and (
profile["mimetype"] == "image/jpeg"
or "background" in profile.get("save_kwargs", {})
):
image = image.flatten(
background=profile.get("save_kwargs", {}).get("background", [255, 255, 255])
)
save_name = str(profile["save"])
try:
image_bytes = getattr(image, save_name)(**dict(profile.get("save_kwargs", {})))
except pyvips.Error as exc:
raise ImageException(str(exc)) from exc
return BytesIO(cast(bytes, image_bytes))
def image_buffer_meta(
body: bytes,
*,
fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
image = load_image_from_buffer(body)
mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype)
return image.width, image.height, len(body), mimetype
def image_variant_meta(
file_path: str | Path,
*,
fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
image = load_image_from_file(file_path)
mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype)
return image.width, image.height, Path(file_path).stat().st_size, mimetype
class ImageNormalizePipeline(BaseFilesPipeline):
MEDIA_NAME = "image"
EXPIRES = 90
MIN_WIDTH = 0
MIN_HEIGHT = 0
DEFAULT_FILES_URLS_FIELD = "image_urls"
DEFAULT_FILES_RESULT_FIELD = "images"
@classmethod
def from_crawler(cls, crawler: Crawler):
cls._update_stores(crawler.settings)
return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
resolve = functools.partial(
self._key_for_pipe,
base_class_name="ImagesPipeline",
settings=self.settings,
)
self.expires = self.settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES)
self.files_urls_field = self.settings.get(
resolve("IMAGES_URLS_FIELD"),
self.DEFAULT_FILES_URLS_FIELD,
)
self.files_result_field = self.settings.get(
resolve("IMAGES_RESULT_FIELD"),
self.DEFAULT_FILES_RESULT_FIELD,
)
self.min_width = self.settings.getint(
resolve("IMAGES_MIN_WIDTH"),
self.MIN_WIDTH,
)
self.min_height = self.settings.getint(
resolve("IMAGES_MIN_HEIGHT"),
self.MIN_HEIGHT,
)
def get_image_settings(self) -> list[dict[str, Any]]:
return list(self.settings["REPUBLISHER_IMAGE"])
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.canonical_published_image_path(
request.url,
self.get_image_settings(),
)
def source_path(self, request, response=None) -> str:
return repub.utils.source_image_path(
request.url,
image_mimetype(response, url=request.url),
)
def resolve_source_path(self, request, response=None) -> str:
source_path = self.source_path(request, response)
if response is not None:
return source_path
source_file = self.local_store_path(source_path)
if source_file.exists():
return source_path
source_dir = self.local_store_path(
str(self.settings.get("REPUBLISHER_IMAGE_SOURCE_SUBDIR", "source"))
)
guid = repub.utils.image_guid(request.url)
matches = sorted(source_dir.glob(f"{guid}.*"))
if matches:
return f"{source_dir.name}/{matches[0].name}"
return source_path
def variant_paths(self, source_url: str) -> list[tuple[bool, dict[str, Any], str]]:
return [
(
index == 0,
setting,
repub.utils.published_image_path(source_url, setting),
)
for index, setting in enumerate(self.get_image_settings())
]
def published_url(self, path: str, item=None) -> str:
relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
if feed_url == "" or item is None:
return relative_path
return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"
def local_store_path(self, path: str) -> Path:
return Path(cast(Any, self.store).basedir) / path
def image_variant(
self,
*,
path: str,
mimetype: str,
width: int,
height: int,
file_size: int,
is_default: bool,
item=None,
) -> MediaVariant:
variant: MediaVariant = {
"url": self.published_url(path, item),
"path": path,
"type": mimetype,
"medium": repub.utils.FileType.IMAGE.value,
"isDefault": "true" if is_default else "false",
"fileSize": file_size,
"width": width,
"height": height,
}
return variant
def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
variants: list[MediaVariant] = []
for is_default, setting, path in self.variant_paths(request.url):
file_path = self.local_store_path(path)
if not file_path.exists():
continue
width, height, file_size, mimetype = image_variant_meta(
file_path,
fallback_mimetype=setting["mimetype"],
)
variants.append(
self.image_variant(
path=path,
mimetype=mimetype or setting["mimetype"],
width=width,
height=height,
file_size=file_size,
is_default=is_default,
item=item,
)
)
return variants
def make_file_result(
self,
request,
*,
checksum: str | None,
status: str,
response=None,
item=None,
) -> TranscodedImageFile:
path = self.file_path(request, item=item)
return {
"url": request.url,
"path": path,
"published_url": self.published_url(path, item),
"checksum": checksum,
"status": status,
"source_path": self.resolve_source_path(request, response),
"variants": self.load_variants_from_disk(request, item=item),
"thumbnails": [],
}
def media_to_download(self, request, info, *, item=None):
canonical_path = self.file_path(request, info=info, item=item)
canonical_stat = cast(
dict[str, Any] | None,
self.store.stat_file(canonical_path, info),
)
if not canonical_stat:
return None
last_modified = canonical_stat.get("last_modified")
if not last_modified:
return None
age_days = (time.time() - last_modified) / 60 / 60 / 24
if age_days > self.expires:
return None
if not cast(
dict[str, Any] | None,
self.store.stat_file(self.resolve_source_path(request), info),
):
return None
for _, _, path in self.variant_paths(request.url):
if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
return None
self.inc_stats("uptodate")
return self.make_file_result(
request,
checksum=canonical_stat.get("checksum"),
status="uptodate",
item=item,
)
def persist_variants(self, response, request, info, *, item=None) -> str | None:
source_file_path = self.local_store_path(self.source_path(request, response))
source_buf = BytesIO(response.body)
source_image = load_image_from_buffer(response.body).autorot()
if source_image.width < self.min_width or source_image.height < self.min_height:
raise ImageException(
"Image too small "
f"({source_image.width}x{source_image.height} < "
f"{self.min_width}x{self.min_height})"
)
if not cast(
dict[str, Any] | None,
self.store.stat_file(self.source_path(request, response), info),
):
self.store.persist_file(
self.source_path(request, response),
source_buf,
info,
meta={"width": source_image.width, "height": source_image.height},
headers={
"Content-Type": image_loader_mimetype(
image_loader_name(source_image),
image_mimetype(response, url=request.url),
)
or "application/octet-stream"
},
)
canonical_path = self.file_path(
request, response=response, info=info, item=item
)
canonical_checksum = None
for _, setting, final_path in self.variant_paths(request.url):
stat = cast(dict[str, Any] | None, self.store.stat_file(final_path, info))
if stat:
if final_path == canonical_path:
canonical_checksum = stat.get("checksum")
continue
out_buf = render_image_profile(source_file_path, setting)
width, height, file_size, _ = image_buffer_meta(
out_buf.getvalue(),
fallback_mimetype=setting["mimetype"],
)
checksum = buffer_checksum(out_buf)
self.store.persist_file(
final_path,
out_buf,
info,
meta={"width": width, "height": height, "fileSize": file_size},
headers={"Content-Type": setting["mimetype"]},
)
if final_path == canonical_path:
canonical_checksum = checksum
return canonical_checksum
def media_downloaded(self, response, request, info, *, item=None):
if response.status != 200:
raise FileException("download-error")
if not response.body:
raise FileException("empty-content")
status = "cached" if "cached" in response.flags else "downloaded"
self.inc_stats(status)
checksum = self.persist_variants(response, request, info, item=item)
return self.make_file_result(
request,
checksum=checksum,
status=status,
response=response,
item=item,
)
class ImageThumbnailPipeline:
@classmethod
def from_crawler(cls, crawler: Crawler):
return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
self.settings = crawler.settings
self.store_dir = Path(store_uri)
def get_thumbnail_settings(self) -> list[dict[str, Any]]:
return list(self.settings["REPUBLISHER_IMAGE_THUMBNAILS"])
def local_store_path(self, path: str) -> Path:
return self.store_dir / path
def published_url(self, path: str, item=None) -> str:
relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
if feed_url == "" or item is None:
return relative_path
return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"
def persist_thumbnail(
self, source_file: Path, final_path: str, profile: dict[str, Any]
):
out_buf = render_image_profile(source_file, profile)
target = self.local_store_path(final_path)
target.parent.mkdir(parents=True, exist_ok=True)
target.write_bytes(out_buf.getvalue())
def load_thumbnail(
self,
*,
source_url: str,
profile: dict[str, Any],
item=None,
) -> ThumbnailVariant | None:
final_path = repub.utils.thumbnail_image_path(source_url, profile)
file_path = self.local_store_path(final_path)
if not file_path.exists():
return None
width, height, _, mimetype = image_variant_meta(
file_path,
fallback_mimetype=profile["mimetype"],
)
return {
"url": self.published_url(final_path, item),
"path": final_path,
"slot": str(profile["name"]),
"type": mimetype or profile["mimetype"],
"width": width,
"height": height,
}
def process_item(self, item, spider):
del spider
if not getattr(item, "images", None):
return item
for image in item.images:
source_path = image.get("source_path")
if not source_path:
image["thumbnails"] = []
continue
source_file = self.local_store_path(source_path)
thumbnails: list[ThumbnailVariant] = []
for profile in self.get_thumbnail_settings():
final_path = repub.utils.thumbnail_image_path(image["url"], profile)
if not self.local_store_path(final_path).exists():
try:
self.persist_thumbnail(source_file, final_path, profile)
except ImageException as exc:
logger.warning(
"Failed to generate thumbnail for %s: %s", image["url"], exc
)
continue
thumbnail = self.load_thumbnail(
source_url=image["url"],
profile=profile,
item=item,
)
if thumbnail is not None:
thumbnails.append(thumbnail)
image["thumbnails"] = thumbnails
return item
ImagePipeline = ImageNormalizePipeline
class FilePipeline(BaseFilesPipeline):
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_file_path(request.url)
def read_asset(file_path: str | Path) -> BytesIO:
buf_converted = BytesIO()
with open(file_path, "rb") as f:
buf_converted.write(f.read())
buf_converted.seek(0)
return buf_converted
def buffer_checksum(buf: BytesIO) -> str:
buf.seek(0)
checksum = hashlib.md5(buf.read(), usedforsecurity=False).hexdigest() # nosec
buf.seek(0)
return checksum
class TranscodePipeline(BaseFilesPipeline):
def __init__(
self,
media_type: repub.utils.FileType,
store_uri: Union[str, PathLike],
*,
crawler: Crawler,
):
self.media_type = media_type
self.settings = crawler.settings
super().__init__(store_uri, crawler=crawler)
def transcode(
self, input_file: str, settings: media.MediaSettings, tmp_dir: str
) -> Optional[str]:
probe_result = media.probe_media(input_file)
params = self.get_transcode_params(probe_result, settings)
if params is not None:
return self.transcode_media(input_file, tmp_dir, params)
logger.info(
f"Skipping audio compression for {input_file}, it meets requirements"
)
return None
def get_media_settings(self) -> List[media.MediaSettings]:
raise NotImplementedError()
def get_transcode_params(self, probe_result, settings) -> Optional[Dict[str, str]]:
raise NotImplementedError()
def transcode_media(self, input_file: str, tmp_dir: str, params) -> str:
raise NotImplementedError()
def get_media_meta(self, probe_result) -> media.MediaMeta:
raise NotImplementedError()
def media_dir(self) -> str:
setting_name = {
repub.utils.FileType.AUDIO: "REPUBLISHER_AUDIO_DIR",
repub.utils.FileType.VIDEO: "REPUBLISHER_VIDEO_DIR",
}.get(self.media_type)
if setting_name is None:
raise ValueError(f"Unsupported media type: {self.media_type}")
return self.settings[setting_name]
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.canonical_published_media_path(
self.media_type,
request.url,
self.get_media_settings(),
)
def variant_paths(
self, source_url: str
) -> list[tuple[bool, media.MediaSettings, str]]:
settings = self.get_media_settings()
return [
(
index == 0,
setting,
repub.utils.published_media_path(self.media_type, source_url, setting),
)
for index, setting in enumerate(settings)
]
def original_path(self, source_url: str) -> str:
if self.media_type == repub.utils.FileType.AUDIO:
return repub.utils.local_audio_path(source_url)
if self.media_type == repub.utils.FileType.VIDEO:
return repub.utils.local_video_path(source_url)
raise ValueError(f"Unsupported media type: {self.media_type}")
def original_mimetype(self, source_url: str, response=None) -> str:
if response is not None:
content_type = response.headers.get(b"Content-Type")
if content_type:
return content_type.decode("utf-8").split(";", 1)[0].strip()
mimetype = mimetypes.guess_type(source_url)[0]
if mimetype:
return mimetype
return {
repub.utils.FileType.AUDIO: "audio/mpeg",
repub.utils.FileType.VIDEO: "video/mp4",
}[self.media_type]
def published_url(self, path: str, item=None) -> str:
relative_path = f"{self.media_dir()}/{path}"
feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
if feed_url == "" or item is None:
return relative_path
return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"
def local_store_path(self, path: str) -> Path:
return Path(cast(Any, self.store).basedir) / path
def media_variant(
self,
*,
path: str,
mimetype: str,
probe_result: dict[str, Any],
is_default: bool,
item=None,
) -> MediaVariant:
variant: MediaVariant = {
"url": self.published_url(path, item),
"path": path,
"type": mimetype,
"medium": self.media_type.value,
"isDefault": "true" if is_default else "false",
}
meta = self.get_media_meta(probe_result) or {}
for key, value in meta.items():
if value not in (None, ""):
variant[key] = value
return variant
def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
variants: list[MediaVariant] = []
for is_default, setting, path in self.variant_paths(request.url):
file_path = self.local_store_path(path)
if not file_path.exists():
continue
probe_result = media.probe_media(str(file_path))
variants.append(
self.media_variant(
path=path,
mimetype=setting["mimetype"],
probe_result=probe_result,
is_default=is_default,
item=item,
)
)
original_path = self.original_path(request.url)
original_file = self.local_store_path(original_path)
if original_file.exists():
variants.append(
self.media_variant(
path=original_path,
mimetype=self.original_mimetype(request.url),
probe_result=media.probe_media(str(original_file)),
is_default=False,
item=item,
)
)
return variants
def make_file_result(
self,
request,
*,
checksum: str | None,
status: str,
item=None,
) -> TranscodedMediaFile:
path = self.file_path(request, item=item)
return {
"url": request.url,
"path": path,
"published_url": self.published_url(path, item),
"checksum": checksum,
"status": status,
"variants": self.load_variants_from_disk(request, item=item),
}
def media_to_download(self, request, info, *, item=None):
canonical_path = self.file_path(request, info=info, item=item)
canonical_stat = cast(
dict[str, Any] | None,
self.store.stat_file(canonical_path, info),
)
if not canonical_stat:
return None
last_modified = canonical_stat.get("last_modified")
if not last_modified:
return None
age_days = (time.time() - last_modified) / 60 / 60 / 24
if age_days > self.expires:
return None
for _, _, path in self.variant_paths(request.url):
if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
return None
if not cast(
dict[str, Any] | None,
self.store.stat_file(self.original_path(request.url), info),
):
return None
self.inc_stats("uptodate")
return self.make_file_result(
request,
checksum=canonical_stat.get("checksum"),
status="uptodate",
item=item,
)
def persist_variants(self, response, request, info, *, item=None) -> str | None:
canonical_path = self.file_path(
request, response=response, info=info, item=item
)
canonical_checksum = None
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_file = f"{tmp_dir}/original"
with open(tmp_file, "wb") as f:
f.write(response.body)
original_path = self.original_path(request.url)
if not cast(
dict[str, Any] | None,
self.store.stat_file(original_path, info),
):
original_buf = read_asset(tmp_file)
self.store.persist_file(
original_path,
original_buf,
info,
meta=self.get_media_meta(media.probe_media(tmp_file)),
headers={
"Content-Type": self.original_mimetype(
request.url, response=response
)
},
)
for _, setting, final_path in self.variant_paths(request.url):
stat = cast(
dict[str, Any] | None,
self.store.stat_file(final_path, info),
)
if stat:
logger.info(f"Skipping, transcoded media exists at {final_path}")
if final_path == canonical_path:
canonical_checksum = stat.get("checksum")
continue
out_file = self.transcode(tmp_file, setting, tmp_dir) or tmp_file
out_buf = read_asset(out_file)
probe_result = media.probe_media(out_file)
meta = self.get_media_meta(probe_result)
logger.info(f"{self.media_type} final {final_path} with {meta}")
checksum = buffer_checksum(out_buf)
self.store.persist_file(
final_path,
out_buf,
info,
meta=meta,
headers={"Content-Type": setting["mimetype"]},
)
if final_path == canonical_path:
canonical_checksum = checksum
return canonical_checksum
def media_downloaded(self, response, request, info, *, item=None):
if response.status != 200:
raise FileException("download-error")
if not response.body:
raise FileException("empty-content")
status = "cached" if "cached" in response.flags else "downloaded"
self.inc_stats(status)
checksum = self.persist_variants(response, request, info, item=item)
return self.make_file_result(
request,
checksum=checksum,
status=status,
item=item,
)
class AudioPipeline(TranscodePipeline):
DEFAULT_FILES_URLS_FIELD = "audio_urls"
DEFAULT_FILES_RESULT_FIELD = "audios"
@classmethod
def from_crawler(cls, crawler: Crawler):
cls._update_stores(crawler.settings)
return cls(crawler.settings["AUDIO_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
super().__init__(repub.utils.FileType.AUDIO, store_uri, crawler=crawler)
def get_media_settings(self) -> List[media.AudioSettings]:
return self.settings["REPUBLISHER_AUDIO"]
def get_transcode_params(self, probe_result, settings) -> Optional[Dict[str, str]]:
return media.audio_transcode_params(probe_result, settings)
def transcode_media(self, input_file: str, tmp_dir: str, params) -> str:
return media.transcode_audio(input_file, tmp_dir, params)
def get_media_meta(self, probe_result) -> Optional[media.AudioMeta]:
return media.audio_meta(probe_result)
class VideoPipeline(TranscodePipeline):
DEFAULT_FILES_URLS_FIELD = "video_urls"
DEFAULT_FILES_RESULT_FIELD = "videos"
@classmethod
def from_crawler(cls, crawler: Crawler):
cls._update_stores(crawler.settings)
return cls(crawler.settings["VIDEO_STORE"], crawler=crawler)
def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
super().__init__(repub.utils.FileType.VIDEO, store_uri, crawler=crawler)
def get_media_settings(self) -> List[media.VideoSettings]:
return self.settings["REPUBLISHER_VIDEO"]
def get_transcode_params(self, probe_result, settings) -> Optional[Dict[str, str]]:
return media.video_transcode_params(probe_result, settings)
def transcode_media(self, input_file: str, tmp_dir: str, params) -> str:
return media.transcode_video(input_file, tmp_dir, params)
def get_media_meta(self, probe_result) -> Optional[media.VideoMeta]:
return media.video_meta(probe_result)