Replace image pipeline with profile-driven variants

- add image normalization profiles and thumbnail profiles
- generate source, full-size variant, and thumbnail image artifacts
- rewrite canonical image URLs through the first configured profile
- emit explicit image Media RSS groups with named thumbnails
- preserve legacy image paths when image conversion is disabled
- cover cache-hit source paths, inline image handling, and thumbnail export
This commit is contained in:
Abel Luck 2026-05-27 09:24:22 +02:00
parent 7316d4723f
commit 525393272e
13 changed files with 1299 additions and 124 deletions

View file

@ -16,7 +16,12 @@ from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline
import repub.utils
from repub import media
from repub.items import MediaVariant, TranscodedMediaFile
from repub.items import (
MediaVariant,
ThumbnailVariant,
TranscodedImageFile,
TranscodedMediaFile,
)
logger = logging.getLogger(__name__)
@ -34,34 +39,108 @@ def image_mimetype(response=None, *, url: str | None = None) -> str | None:
return None
def convert_image_body_to_jpeg(
body: bytes,
*,
source_mimetype: str | None = None,
) -> tuple[BytesIO, int, int]:
def image_loader_name(image: Any) -> str:
    """Return the value of the ``vips-loader`` metadata field of *image*.

    The field is only read when ``image.get_typeof("vips-loader")`` is truthy;
    otherwise an empty string is returned.
    """
    has_loader = image.get_typeof("vips-loader")
    return str(image.get("vips-loader")) if has_loader else ""
def image_loader_mimetype(loader: str, fallback: str | None = None) -> str | None:
known = {
"jpegload": "image/jpeg",
"pngload": "image/png",
"gifload": "image/gif",
"svgload": "image/svg+xml",
"tiffload": "image/tiff",
"webpload": "image/webp",
"heifload": "image/heif",
"jxlload": "image/jxl",
}
for prefix, mimetype in known.items():
if loader.startswith(prefix):
return mimetype
return fallback
def load_image_from_buffer(body: bytes) -> Any:
    """Open encoded image bytes with pyvips using sequential access.

    The image is returned as loaded; no rotation is applied here, so callers
    that need it call ``.autorot()`` on the result themselves.

    Raises:
        ImageException: If pyvips cannot load *body*.
    """
    try:
        return cast(
            Any,
            pyvips.Image.new_from_buffer(body, "", access="sequential"),
        )
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc
width = image.width
height = image.height
loader = ""
if image.get_typeof("vips-loader"):
loader = str(image.get("vips-loader"))
if source_mimetype == "image/jpeg" or loader.startswith("jpegload"):
return BytesIO(body), width, height
if image.hasalpha():
image = image.flatten(background=[255, 255, 255])
def load_image_from_file(file_path: str | Path) -> Any:
    """Open the image stored at *file_path* with pyvips using sequential access.

    Raises:
        ImageException: If pyvips cannot load the file.
    """
    location = str(file_path)
    try:
        loaded = pyvips.Image.new_from_file(location, access="sequential")
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc
    return cast(Any, loaded)
def render_image_profile(source_path: str | Path, profile: dict[str, Any]) -> BytesIO:
    """Render the image at *source_path* through one profile and encode it.

    Keys read from *profile*:
        transform: ``"thumbnail"`` (load from the file path) or
            ``"thumbnail_buffer"`` (read the file and load from bytes).
        transform_kwargs: Must contain ``width``; the remaining entries are
            forwarded to the pyvips thumbnail call.
        mimetype: Output MIME type; ``"image/jpeg"`` forces alpha flattening.
        save: Name of the image method used to encode the result.
        save_kwargs: Optional keyword arguments for that method. A
            ``background`` entry also forces alpha flattening and supplies
            the flatten colour.

    Returns:
        A ``BytesIO`` holding the encoded image.

    Raises:
        ImageException: If the transform is unsupported, or pyvips fails
            while loading/resizing the source or encoding the output.
        KeyError: If a required profile key is missing.
    """
    transform = str(profile["transform"])
    # Copy before popping so the caller's profile dict is not mutated.
    transform_kwargs = dict(profile.get("transform_kwargs", {}))
    width = int(transform_kwargs.pop("width"))
    save_kwargs = dict(profile.get("save_kwargs", {}))

    # Loading can fail inside pyvips (missing or corrupt source); surface that
    # as ImageException, the same way the save step below does.
    try:
        if transform == "thumbnail":
            image = cast(
                Any,
                pyvips.Image.thumbnail(str(source_path), width, **transform_kwargs),
            )
        elif transform == "thumbnail_buffer":
            image = cast(
                Any,
                pyvips.Image.thumbnail_buffer(
                    Path(source_path).read_bytes(),
                    width,
                    **transform_kwargs,
                ),
            )
        else:
            raise ImageException(f"Unsupported image transform: {transform}")
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc

    # Flatten transparency for JPEG output, or whenever the profile supplies
    # an explicit background colour.
    if image.hasalpha() and (
        profile["mimetype"] == "image/jpeg" or "background" in save_kwargs
    ):
        image = image.flatten(
            background=save_kwargs.get("background", [255, 255, 255])
        )

    save_name = str(profile["save"])
    try:
        image_bytes = getattr(image, save_name)(**save_kwargs)
    except pyvips.Error as exc:
        raise ImageException(str(exc)) from exc
    return BytesIO(cast(bytes, image_bytes))
class ImagePipeline(BaseFilesPipeline):
def image_buffer_meta(
    body: bytes,
    *,
    fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
    """Describe an encoded image held in memory.

    Returns:
        ``(width, height, byte_length, mimetype)`` where the MIME type comes
        from the loader pyvips used, or *fallback_mimetype* when the loader
        is not recognised.

    Raises:
        ImageException: If the bytes cannot be loaded as an image.
    """
    decoded = load_image_from_buffer(body)
    loader = image_loader_name(decoded)
    detected = image_loader_mimetype(loader, fallback_mimetype)
    return decoded.width, decoded.height, len(body), detected
def image_variant_meta(
    file_path: str | Path,
    *,
    fallback_mimetype: str | None = None,
) -> tuple[int, int, int, str | None]:
    """Describe an image file on disk.

    Returns:
        ``(width, height, size_on_disk, mimetype)`` where the MIME type comes
        from the loader pyvips used, or *fallback_mimetype* when the loader
        is not recognised.

    Raises:
        ImageException: If the file cannot be loaded as an image.
    """
    opened = load_image_from_file(file_path)
    loader = image_loader_name(opened)
    detected = image_loader_mimetype(loader, fallback_mimetype)
    pixel_width = opened.width
    pixel_height = opened.height
    size_on_disk = Path(file_path).stat().st_size
    return pixel_width, pixel_height, size_on_disk, detected
class ImageNormalizePipeline(BaseFilesPipeline):
MEDIA_NAME = "image"
EXPIRES = 90
MIN_WIDTH = 0
@ -100,29 +179,312 @@ class ImagePipeline(BaseFilesPipeline):
self.MIN_HEIGHT,
)
def file_path(self, request, response=None, info=None, *, item=None):
return repub.utils.local_image_path(request.url)
def get_image_settings(self) -> list[dict[str, Any]]:
    """Return a new list holding the ``REPUBLISHER_IMAGE`` profile entries."""
    return [*self.settings["REPUBLISHER_IMAGE"]]
def file_downloaded(self, response, request, info, *, item=None):
path = self.file_path(request, response=response, info=info, item=item)
buf, width, height = convert_image_body_to_jpeg(
response.body,
source_mimetype=image_mimetype(response, url=request.url),
def file_path(self, request, response=None, info=None, *, item=None):
    """Return the canonical published path for the image at ``request.url``.

    The path is computed by ``repub.utils.canonical_published_image_path``
    from the request URL and the configured image profiles; ``response``,
    ``info`` and ``item`` are accepted but not used.
    """
    profiles = self.get_image_settings()
    return repub.utils.canonical_published_image_path(request.url, profiles)
if width < self.min_width or height < self.min_height:
def source_path(self, request, response=None) -> str:
    """Return the store-relative path used for the unmodified source image.

    The path is derived from the request URL and the MIME type reported by
    ``image_mimetype`` for the response (or for the URL alone when there is
    no response).
    """
    url = request.url
    detected_mimetype = image_mimetype(response, url=url)
    return repub.utils.source_image_path(url, detected_mimetype)
def resolve_source_path(self, request, response=None) -> str:
    """Return the store-relative path of the source image for *request*.

    With a response in hand the computed path is returned as-is. Without one
    (cache-hit checks), the computed path is used when that file exists;
    otherwise the source subdirectory is searched for any file named
    ``<guid>.*`` and the first match in sorted order is returned. If nothing
    matches, the computed path is returned even though it does not exist.
    """
    source_path = self.source_path(request, response)
    if response is not None:
        return source_path
    if self.local_store_path(source_path).exists():
        return source_path

    # Without a response the MIME type (and so the extension) may have been
    # guessed differently than at download time; look for the same guid
    # under any extension.
    # NOTE(review): assumes source files are stored as "<subdir>/<guid>.<ext>"
    # -- confirm against repub.utils.source_image_path.
    source_subdir = str(
        self.settings.get("REPUBLISHER_IMAGE_SOURCE_SUBDIR", "source")
    )
    source_dir = self.local_store_path(source_subdir)
    guid = repub.utils.image_guid(request.url)
    matches = sorted(source_dir.glob(f"{guid}.*"))
    if matches:
        # Rebuild from the configured subdirectory, not ``source_dir.name``:
        # the latter keeps only the last component of a nested subdirectory.
        parts = (source_subdir.rstrip("/"), matches[0].name)
        return "/".join(part for part in parts if part)
    return source_path
def variant_paths(self, source_url: str) -> list[tuple[bool, dict[str, Any], str]]:
    """List ``(is_first, profile, published_path)`` for every image profile.

    ``is_first`` is True only for the first configured profile; the path is
    computed by ``repub.utils.published_image_path``.
    """
    triples: list[tuple[bool, dict[str, Any], str]] = []
    for position, profile in enumerate(self.get_image_settings()):
        published = repub.utils.published_image_path(source_url, profile)
        triples.append((position == 0, profile, published))
    return triples
def published_url(self, path: str, item=None) -> str:
    """Build the URL under which the stored image at *path* is published.

    Returns ``<REPUBLISHER_IMAGE_DIR>/<path>`` when no feed URL is configured
    or no item is given; otherwise that relative path is prefixed with
    ``<REPUBLISHER_FEED_URL>/feeds/<item.feed_name>/``.
    """
    base_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
    relative = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
    if base_url != "" and item is not None:
        return f"{base_url}/feeds/{item.feed_name}/{relative}"
    return relative
def local_store_path(self, path: str) -> Path:
    """Return *path* joined onto the store's ``basedir``."""
    store = cast(Any, self.store)
    return Path(store.basedir).joinpath(path)
def image_variant(
    self,
    *,
    path: str,
    mimetype: str,
    width: int,
    height: int,
    file_size: int,
    is_default: bool,
    item=None,
) -> MediaVariant:
    """Assemble the ``MediaVariant`` mapping describing one stored image.

    ``isDefault`` is emitted as the string ``"true"`` or ``"false"``, and
    ``url`` is built with ``published_url`` for *path* and *item*.
    """
    default_flag = "true" if is_default else "false"
    described: MediaVariant = {
        "url": self.published_url(path, item),
        "path": path,
        "type": mimetype,
        "medium": repub.utils.FileType.IMAGE.value,
        "isDefault": default_flag,
        "fileSize": file_size,
        "width": width,
        "height": height,
    }
    return described
def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]:
    """Describe every profile variant of ``request.url`` that exists on disk.

    Profiles whose file is missing are skipped. Dimensions, size and MIME
    type are read from the stored file, with the profile's ``mimetype`` used
    when the file's loader is not recognised.
    """
    found: list[MediaVariant] = []
    for is_default, profile, relative_path in self.variant_paths(request.url):
        stored_file = self.local_store_path(relative_path)
        if stored_file.exists():
            width, height, file_size, detected = image_variant_meta(
                stored_file,
                fallback_mimetype=profile["mimetype"],
            )
            found.append(
                self.image_variant(
                    path=relative_path,
                    mimetype=detected or profile["mimetype"],
                    width=width,
                    height=height,
                    file_size=file_size,
                    is_default=is_default,
                    item=item,
                )
            )
    return found
def make_file_result(
    self,
    request,
    *,
    checksum: str | None,
    status: str,
    response=None,
    item=None,
) -> TranscodedImageFile:
    """Build the result mapping the pipeline reports for one image.

    The mapping carries the original URL, the canonical path and its
    published URL, the given checksum and status, the resolved source path,
    the variants currently on disk, and an empty ``thumbnails`` list.
    """
    canonical = self.file_path(request, item=item)
    canonical_url = self.published_url(canonical, item)
    source = self.resolve_source_path(request, response)
    variants = self.load_variants_from_disk(request, item=item)
    return {
        "url": request.url,
        "path": canonical,
        "published_url": canonical_url,
        "checksum": checksum,
        "status": status,
        "source_path": source,
        "variants": variants,
        "thumbnails": [],
    }
def media_to_download(self, request, info, *, item=None):
canonical_path = self.file_path(request, info=info, item=item)
canonical_stat = cast(
dict[str, Any] | None,
self.store.stat_file(canonical_path, info),
)
if not canonical_stat:
return None
last_modified = canonical_stat.get("last_modified")
if not last_modified:
return None
age_days = (time.time() - last_modified) / 60 / 60 / 24
if age_days > self.expires:
return None
if not cast(
dict[str, Any] | None,
self.store.stat_file(self.resolve_source_path(request), info),
):
return None
for _, _, path in self.variant_paths(request.url):
if not cast(dict[str, Any] | None, self.store.stat_file(path, info)):
return None
self.inc_stats("uptodate")
return self.make_file_result(
request,
checksum=canonical_stat.get("checksum"),
status="uptodate",
item=item,
)
def persist_variants(self, response, request, info, *, item=None) -> str | None:
    """Store the downloaded source image and render each missing variant.

    The source bytes are persisted unchanged (unless already in the store),
    then every configured profile whose output is not yet stored is rendered
    from the source file and persisted.

    Returns:
        The checksum of the canonical variant: the stored checksum when the
        file already existed, the freshly computed one when it was rendered,
        or ``None`` if no profile maps to the canonical path.

    Raises:
        ImageException: If the image cannot be decoded, is smaller than the
            configured minimum size, or a profile fails to render.
    """
    # Compute the source location once; it is needed for the existence
    # check, the persist call and as rendering input.
    source_path = self.source_path(request, response)
    source_file_path = self.local_store_path(source_path)

    source_image = load_image_from_buffer(response.body).autorot()
    if source_image.width < self.min_width or source_image.height < self.min_height:
        raise ImageException(
            "Image too small "
            f"({source_image.width}x{source_image.height} < "
            f"{self.min_width}x{self.min_height})"
        )

    if not cast(dict[str, Any] | None, self.store.stat_file(source_path, info)):
        source_mimetype = image_loader_mimetype(
            image_loader_name(source_image),
            image_mimetype(response, url=request.url),
        )
        self.store.persist_file(
            source_path,
            BytesIO(response.body),
            info,
            meta={"width": source_image.width, "height": source_image.height},
            headers={"Content-Type": source_mimetype or "application/octet-stream"},
        )

    canonical_path = self.file_path(
        request, response=response, info=info, item=item
    )
    canonical_checksum = None
    for _, setting, final_path in self.variant_paths(request.url):
        stat = cast(dict[str, Any] | None, self.store.stat_file(final_path, info))
        if stat:
            # Already rendered: reuse it, remembering the canonical checksum.
            if final_path == canonical_path:
                canonical_checksum = stat.get("checksum")
            continue
        # NOTE(review): rendering reads the source from the local store
        # path, so this presumably requires a filesystem-backed store.
        out_buf = render_image_profile(source_file_path, setting)
        width, height, file_size, _ = image_buffer_meta(
            out_buf.getvalue(),
            fallback_mimetype=setting["mimetype"],
        )
        checksum = buffer_checksum(out_buf)
        self.store.persist_file(
            final_path,
            out_buf,
            info,
            meta={"width": width, "height": height, "fileSize": file_size},
            headers={"Content-Type": setting["mimetype"]},
        )
        if final_path == canonical_path:
            canonical_checksum = checksum
    return canonical_checksum
def media_downloaded(self, response, request, info, *, item=None):
    """Validate a finished download, persist its artifacts and report it.

    Raises:
        FileException: ``"download-error"`` for a non-200 status, or
            ``"empty-content"`` for an empty body.
    """
    if response.status != 200:
        raise FileException("download-error")
    if not response.body:
        raise FileException("empty-content")

    if "cached" in response.flags:
        outcome = "cached"
    else:
        outcome = "downloaded"
    self.inc_stats(outcome)

    canonical_checksum = self.persist_variants(response, request, info, item=item)
    return self.make_file_result(
        request,
        checksum=canonical_checksum,
        status=outcome,
        response=response,
        item=item,
    )
class ImageThumbnailPipeline:
    """Item pipeline that attaches thumbnails to each image of an item.

    Thumbnails are rendered from the stored source image of every entry in
    ``item.images`` according to the ``REPUBLISHER_IMAGE_THUMBNAILS``
    profiles and written below the ``IMAGES_STORE`` directory.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Create the pipeline from the crawler's ``IMAGES_STORE`` setting."""
        return cls(crawler.settings["IMAGES_STORE"], crawler=crawler)

    def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler):
        self.settings = crawler.settings
        self.store_dir = Path(store_uri)

    def get_thumbnail_settings(self) -> list[dict[str, Any]]:
        """Return a new list holding the configured thumbnail profiles."""
        return list(self.settings["REPUBLISHER_IMAGE_THUMBNAILS"])

    def local_store_path(self, path: str) -> Path:
        """Return *path* joined onto the store directory."""
        return self.store_dir / path

    def published_url(self, path: str, item=None) -> str:
        """Build the URL under which the stored file at *path* is published.

        Returns ``<REPUBLISHER_IMAGE_DIR>/<path>`` when no feed URL is
        configured or no item is given; otherwise that relative path is
        prefixed with ``<REPUBLISHER_FEED_URL>/feeds/<item.feed_name>/``.
        """
        relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}"
        feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/")
        if feed_url == "" or item is None:
            return relative_path
        return f"{feed_url}/feeds/{item.feed_name}/{relative_path}"

    def persist_thumbnail(
        self, source_file: Path, final_path: str, profile: dict[str, Any]
    ):
        """Render *source_file* through *profile* and write it to *final_path*.

        Parent directories of the target are created as needed.
        """
        out_buf = render_image_profile(source_file, profile)
        target = self.local_store_path(final_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(out_buf.getvalue())

    def load_thumbnail(
        self,
        *,
        source_url: str,
        profile: dict[str, Any],
        item=None,
    ) -> ThumbnailVariant | None:
        """Describe the stored thumbnail of *source_url* for *profile*.

        Returns ``None`` when the thumbnail file does not exist. The ``type``
        falls back to the profile's ``mimetype`` when the file's loader is
        not recognised.
        """
        final_path = repub.utils.thumbnail_image_path(source_url, profile)
        file_path = self.local_store_path(final_path)
        if not file_path.exists():
            return None
        width, height, _, mimetype = image_variant_meta(
            file_path,
            fallback_mimetype=profile["mimetype"],
        )
        return {
            "url": self.published_url(final_path, item),
            "path": final_path,
            "slot": str(profile["name"]),
            "type": mimetype or profile["mimetype"],
            "width": width,
            "height": height,
        }

    def process_item(self, item, spider):
        """Generate missing thumbnails and set ``thumbnails`` on each image.

        Images without a ``source_path`` get an empty list. A thumbnail that
        cannot be rendered or read back is logged and skipped; the remaining
        thumbnails and images are still processed.
        """
        del spider
        if not getattr(item, "images", None):
            return item
        profiles = self.get_thumbnail_settings()
        for image in item.images:
            source_path = image.get("source_path")
            if not source_path:
                image["thumbnails"] = []
                continue
            source_file = self.local_store_path(source_path)
            thumbnails: list[ThumbnailVariant] = []
            for profile in profiles:
                final_path = repub.utils.thumbnail_image_path(image["url"], profile)
                # One broken thumbnail must not abort the whole item. pyvips
                # and filesystem errors are caught alongside ImageException
                # because rendering and reading back can raise either.
                try:
                    if not self.local_store_path(final_path).exists():
                        self.persist_thumbnail(source_file, final_path, profile)
                    thumbnail = self.load_thumbnail(
                        source_url=image["url"],
                        profile=profile,
                        item=item,
                    )
                except (ImageException, pyvips.Error, OSError) as exc:
                    logger.warning(
                        "Failed to generate thumbnail for %s: %s", image["url"], exc
                    )
                    continue
                if thumbnail is not None:
                    thumbnails.append(thumbnail)
            image["thumbnails"] = thumbnails
        return item
# Alias: ``ImagePipeline`` refers to the same class as ``ImageNormalizePipeline``.
# NOTE(review): presumably kept so references to the earlier class name keep
# resolving -- confirm against the project's pipeline settings.
ImagePipeline = ImageNormalizePipeline
class FilePipeline(BaseFilesPipeline):