From 525393272e2afae81a9fe96cdd2754094662f781 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Wed, 27 May 2026 09:24:22 +0200 Subject: [PATCH] Replace image pipeline with profile-driven variants - add image normalization profiles and thumbnail profiles - generate source, full-size variant, and thumbnail image artifacts - rewrite canonical image URLs through the first configured profile - emit explicit image Media RSS groups with named thumbnails - preserve legacy image paths when image conversion is disabled - cover cache-hit source paths, inline image handling, and thumbnail export --- repub/config.py | 22 +- repub/exporters.py | 56 ++++- repub/items.py | 30 ++- repub/pipelines.py | 434 +++++++++++++++++++++++++++++++--- repub/rss.py | 1 + repub/settings.py | 110 +++++++++ repub/spiders/rss_spider.py | 17 +- repub/static/app.css | 28 +++ repub/utils.py | 48 ++++ tests/test_config.py | 45 +++- tests/test_feed_validation.py | 306 +++++++++++++++++++++++- tests/test_file_feeds.py | 70 +++++- tests/test_pipelines.py | 256 +++++++++++++++----- 13 files changed, 1299 insertions(+), 124 deletions(-) diff --git a/repub/config.py b/repub/config.py index e9e86b3..d17c7d7 100644 --- a/repub/config.py +++ b/repub/config.py @@ -188,21 +188,31 @@ def build_feed_settings( video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR) audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR) file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR) + image_normalize_enabled = convert_images and base_settings.getbool( + "REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True + ) + image_thumbnails_enabled = image_normalize_enabled and base_settings.getbool( + "REPUBLISHER_IMAGE_THUMBNAILS_ENABLED", True + ) item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES")) item_pipelines.pop("repub.pipelines.ImagePipeline", None) + item_pipelines.pop("repub.pipelines.ImageNormalizePipeline", None) + item_pipelines.pop("repub.pipelines.ImageThumbnailPipeline", None) item_pipelines.pop("repub.pipelines.AudioPipeline", None) item_pipelines.pop("repub.pipelines.VideoPipeline", None) item_pipelines.pop("repub.pipelines.FilePipeline", None) item_pipelines.update( { - "repub.pipelines.AudioPipeline": 2, - "repub.pipelines.FilePipeline": 4, + "repub.pipelines.AudioPipeline": 3, + "repub.pipelines.FilePipeline": 5, } ) - if convert_images: - item_pipelines["repub.pipelines.ImagePipeline"] = 1 + if image_normalize_enabled: + item_pipelines["repub.pipelines.ImageNormalizePipeline"] = 1 + if image_thumbnails_enabled: + item_pipelines["repub.pipelines.ImageThumbnailPipeline"] = 2 if convert_video: - item_pipelines["repub.pipelines.VideoPipeline"] = 3 + item_pipelines["repub.pipelines.VideoPipeline"] = 4 settings = base_settings.copy() settings.setdict( { @@ -219,6 +229,8 @@ def build_feed_settings( "LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"), "HTTPCACHE_DIR": str(out_dir / "httpcache"), "REPUBLISHER_IMAGE_DIR": image_dir, + "REPUBLISHER_IMAGE_NORMALIZE_ENABLED": image_normalize_enabled, + "REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": image_thumbnails_enabled, "REPUBLISHER_VIDEO_DIR": video_dir, "REPUBLISHER_AUDIO_DIR": audio_dir, "REPUBLISHER_FILE_DIR": file_dir, diff --git a/repub/exporters.py b/repub/exporters.py index 99b0663..ab954c9 100644 --- a/repub/exporters.py +++ b/repub/exporters.py @@ -9,12 +9,17 @@ from repub.items import ( ChannelElementItem, ElementItem, MediaVariant, + ThumbnailVariant, + TranscodedImageFile, TranscodedMediaFile, ) from repub.utils import FileType, determine_file_type MEDIA_CONTENT_TAG = QName(rss.nsmap["media"], "content").text MEDIA_GROUP_TAG = QName(rss.nsmap["media"], "group").text +MEDIA_THUMBNAIL_TAG = QName(rss.nsmap["media"], "thumbnail").text +ANYNEWS_SLOT_ATTR = QName(rss.nsmap["anynews"], "slot").text +ANYNEWS_TYPE_ATTR = QName(rss.nsmap["anynews"], "type").text class RssExporter(BaseItemExporter): @@ -52,7 +57,9 @@ class RssExporter(BaseItemExporter): key: str(value) for key, value in attrib.items() if value not in (None, "") } - def canonical_variant(self, media_file: TranscodedMediaFile) -> MediaVariant | None: + def canonical_variant( + self, media_file: TranscodedMediaFile | TranscodedImageFile + ) -> MediaVariant | None: for variant in media_file["variants"]: if variant.get("isDefault") == "true": return variant @@ -92,6 +99,8 @@ class RssExporter(BaseItemExporter): def strip_managed_media_nodes(self, item: ElementItem) -> dict[str, dict[str, str]]: fallbacks: dict[str, dict[str, str]] = {} managed_types: set[FileType] = set() + if self.managed_image_files(item): + managed_types.add(FileType.IMAGE) if item.audios: managed_types.add(FileType.AUDIO) if item.videos: @@ -100,6 +109,9 @@ class RssExporter(BaseItemExporter): return fallbacks for child in list(item.el): + if child.tag == MEDIA_THUMBNAIL_TAG and FileType.IMAGE in managed_types: + item.el.remove(child) + continue if child.tag == MEDIA_CONTENT_TAG: if self.owned_media_type(child, managed_types) is None: continue @@ -113,25 +125,43 @@ class RssExporter(BaseItemExporter): if child.tag != MEDIA_GROUP_TAG: continue + managed_image_group = False for media_content in list(child): if media_content.tag != MEDIA_CONTENT_TAG: continue - if self.owned_media_type(media_content, managed_types) is None: + owned_type = self.owned_media_type(media_content, managed_types) + if owned_type is None: continue + if owned_type == FileType.IMAGE: + managed_image_group = True fallbacks[media_content.get("url", "")] = { key: value for key, value in media_content.attrib.items() if key in {"expression", "lang"} } child.remove(media_content) + if managed_image_group: + for media_thumbnail in list(child): + if media_thumbnail.tag == MEDIA_THUMBNAIL_TAG: + child.remove(media_thumbnail) if len(child) == 0: item.el.remove(child) return fallbacks + def managed_image_files(self, item: ElementItem) -> list[TranscodedImageFile]: + media_image_urls = set(item.media_image_urls) + if not media_image_urls: + return [] + return [image for image in item.images if image["url"] in media_image_urls] + def append_media_groups( self, item: ElementItem, fallbacks: dict[str, dict[str, str]] ): - for media_file in [*item.audios, *item.videos]: + for media_file in [ + *self.managed_image_files(item), + *item.audios, + *item.videos, + ]: if not media_file["variants"]: continue fallback_attrib = fallbacks.get(media_file["published_url"], {}) @@ -141,7 +171,11 @@ class RssExporter(BaseItemExporter): **self.media_content_attrib(variant, fallback_attrib) ) for variant in media_file["variants"] - ] + ], + *[ + rss.MEDIA.thumbnail(**self.media_thumbnail_attrib(thumbnail)) + for thumbnail in media_file.get("thumbnails", []) + ], ) if group is not None: item.el.append(group) @@ -170,10 +204,22 @@ class RssExporter(BaseItemExporter): ) return attrib + def media_thumbnail_attrib(self, thumbnail: ThumbnailVariant) -> dict[str, str]: + attrib = self.compact_attrib( + url=thumbnail.get("url"), + width=thumbnail.get("width"), + height=thumbnail.get("height"), + ) + if thumbnail.get("slot"): + attrib[ANYNEWS_SLOT_ATTR] = str(thumbnail["slot"]) + if thumbnail.get("type"): + attrib[ANYNEWS_TYPE_ATTR] = str(thumbnail["type"]) + return attrib + def apply_transcoded_media(self, item: Any) -> None: if not isinstance(item, ElementItem): return - if not item.audios and not item.videos: + if not self.managed_image_files(item) and not item.audios and not item.videos: return self.rebuild_enclosures(item) fallbacks = self.strip_managed_media_nodes(item) diff --git a/repub/items.py b/repub/items.py index d5e77be..310da3f 100644 --- a/repub/items.py +++ b/repub/items.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, List, TypedDict @@ -8,7 +8,7 @@ class MediaVariant(TypedDict, total=False): type: str medium: str isDefault: str - fileSize: str + fileSize: int | str bitrate: int | float | str samplingrate: int | str channels: int | str @@ -29,18 +29,39 @@ class TranscodedMediaFile(TypedDict): variants: List[MediaVariant] +class ThumbnailVariant(TypedDict, total=False): + url: str + path: str + width: int | str + height: int | str + slot: str + type: str + + +class TranscodedImageFile(TypedDict): + url: str + path: str + checksum: str | None + status: str + published_url: str + source_path: str + variants: List[MediaVariant] + thumbnails: List[ThumbnailVariant] + + @dataclass class ElementItem: feed_name: str el: Any image_urls: List[str] - images: List[Any] + images: List[TranscodedImageFile] file_urls: List[str] files: List[Any] audio_urls: List[str] audios: List[TranscodedMediaFile] video_urls: List[str] videos: List[TranscodedMediaFile] + media_image_urls: List[str] = field(default_factory=list) @dataclass @@ -48,4 +69,5 @@ class ChannelElementItem: feed_name: str el: Any image_urls: List[str] - images: List[Any] + images: List[TranscodedImageFile] + media_image_urls: List[str] = field(default_factory=list) diff --git a/repub/pipelines.py b/repub/pipelines.py index a32f527..69a6c73 100644 --- a/repub/pipelines.py +++ b/repub/pipelines.py @@ -16,7 +16,12 @@ from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline import repub.utils from repub import media -from repub.items import MediaVariant, TranscodedMediaFile +from repub.items import ( + MediaVariant, + ThumbnailVariant, + TranscodedImageFile, + TranscodedMediaFile, +) logger = logging.getLogger(__name__) @@ -34,34 +39,108 @@ def image_mimetype(response=None, *, url: str | None = None) -> str | None: return None -def convert_image_body_to_jpeg( - body: bytes, - *, - source_mimetype: str | None = None, -) -> tuple[BytesIO, int, int]: +def image_loader_name(image: Any) -> str: + if image.get_typeof("vips-loader"): + return str(image.get("vips-loader")) + return "" + + +def image_loader_mimetype(loader: str, fallback: str | None = None) -> str | None: + known = { + "jpegload": "image/jpeg", + "pngload": "image/png", + "gifload": "image/gif", + "svgload": "image/svg+xml", + "tiffload": "image/tiff", + "webpload": "image/webp", + "heifload": "image/heif", + "jxlload": "image/jxl", + } + for prefix, mimetype in known.items(): + if loader.startswith(prefix): + return mimetype + return fallback + + +def load_image_from_buffer(body: bytes) -> Any: try: - image = cast( + return cast( Any, pyvips.Image.new_from_buffer(body, "", access="sequential"), - ).autorot() + ) except pyvips.Error as exc: raise ImageException(str(exc)) from exc - width = image.width - height = image.height - loader = "" - if image.get_typeof("vips-loader"): - loader = str(image.get("vips-loader")) - if source_mimetype == "image/jpeg" or loader.startswith("jpegload"): - return BytesIO(body), width, height - if image.hasalpha(): - image = image.flatten(background=[255, 255, 255]) +def load_image_from_file(file_path: str | Path) -> Any: + try: + return cast( + Any, + pyvips.Image.new_from_file(str(file_path), access="sequential"), + ) + except pyvips.Error as exc: + raise ImageException(str(exc)) from exc + + +def render_image_profile(source_path: str | Path, profile: dict[str, Any]) -> BytesIO: + transform = str(profile["transform"]) + transform_kwargs = dict(profile.get("transform_kwargs", {})) + width = int(transform_kwargs.pop("width")) + if transform == "thumbnail": + image = cast( + Any, + pyvips.Image.thumbnail(str(source_path), width, **transform_kwargs), + ) + elif transform == "thumbnail_buffer": + image = cast( + Any, + pyvips.Image.thumbnail_buffer( + Path(source_path).read_bytes(), + width, + **transform_kwargs, + ), + ) + else: + raise ImageException(f"Unsupported image transform: {transform}") + image = image.colourspace("srgb") - return BytesIO(image.jpegsave_buffer()), width, height + if image.hasalpha() and ( + profile["mimetype"] == "image/jpeg" + or "background" in profile.get("save_kwargs", {}) + ): + image = image.flatten( + background=profile.get("save_kwargs", {}).get("background", [255, 255, 255]) + ) + + save_name = str(profile["save"]) + try: + image_bytes = getattr(image, save_name)(**dict(profile.get("save_kwargs", {}))) + except pyvips.Error as exc: + raise ImageException(str(exc)) from exc + return BytesIO(cast(bytes, image_bytes)) -class ImagePipeline(BaseFilesPipeline): +def image_buffer_meta( + body: bytes, + *, + fallback_mimetype: str | None = None, +) -> tuple[int, int, int, str | None]: + image = load_image_from_buffer(body) + mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype) + return image.width, image.height, len(body), mimetype + + +def image_variant_meta( + file_path: str | Path, + *, + fallback_mimetype: str | None = None, +) -> tuple[int, int, int, str | None]: + image = load_image_from_file(file_path) + mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype) + return image.width, image.height, Path(file_path).stat().st_size, mimetype + + +class ImageNormalizePipeline(BaseFilesPipeline): MEDIA_NAME = "image" EXPIRES = 90 MIN_WIDTH = 0 @@ -100,29 +179,312 @@ class ImagePipeline(BaseFilesPipeline): self.MIN_HEIGHT, ) - def file_path(self, request, response=None, info=None, *, item=None): - return repub.utils.local_image_path(request.url) + def get_image_settings(self) -> list[dict[str, Any]]: + return list(self.settings["REPUBLISHER_IMAGE"]) - def file_downloaded(self, response, request, info, *, item=None): - path = self.file_path(request, response=response, info=info, item=item) - buf, width, height = convert_image_body_to_jpeg( - response.body, - source_mimetype=image_mimetype(response, url=request.url), + def file_path(self, request, response=None, info=None, *, item=None): + return repub.utils.canonical_published_image_path( + request.url, + self.get_image_settings(), ) - if width < self.min_width or height < self.min_height: + + def source_path(self, request, response=None) -> str: + return repub.utils.source_image_path( + request.url, + image_mimetype(response, url=request.url), + ) + + def resolve_source_path(self, request, response=None) -> str: + source_path = self.source_path(request, response) + if response is not None: + return source_path + source_file = self.local_store_path(source_path) + if source_file.exists(): + return source_path + source_dir = self.local_store_path( + str(self.settings.get("REPUBLISHER_IMAGE_SOURCE_SUBDIR", "source")) + ) + guid = repub.utils.image_guid(request.url) + matches = sorted(source_dir.glob(f"{guid}.*")) + if matches: + return f"{source_dir.name}/{matches[0].name}" + return source_path + + def variant_paths(self, source_url: str) -> list[tuple[bool, dict[str, Any], str]]: + return [ + ( + index == 0, + setting, + repub.utils.published_image_path(source_url, setting), + ) + for index, setting in enumerate(self.get_image_settings()) + ] + + def published_url(self, path: str, item=None) -> str: + relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}" + feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/") + if feed_url == "" or item is None: + return relative_path + return f"{feed_url}/feeds/{item.feed_name}/{relative_path}" + + def local_store_path(self, path: str) -> Path: + return Path(cast(Any, self.store).basedir) / path + + def image_variant( + self, + *, + path: str, + mimetype: str, + width: int, + height: int, + file_size: int, + is_default: bool, + item=None, + ) -> MediaVariant: + variant: MediaVariant = { + "url": self.published_url(path, item), + "path": path, + "type": mimetype, + "medium": repub.utils.FileType.IMAGE.value, + "isDefault": "true" if is_default else "false", + "fileSize": file_size, + "width": width, + "height": height, + } + return variant + + def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]: + variants: list[MediaVariant] = [] + for is_default, setting, path in self.variant_paths(request.url): + file_path = self.local_store_path(path) + if not file_path.exists(): + continue + width, height, file_size, mimetype = image_variant_meta( + file_path, + fallback_mimetype=setting["mimetype"], + ) + variants.append( + self.image_variant( + path=path, + mimetype=mimetype or setting["mimetype"], + width=width, + height=height, + file_size=file_size, + is_default=is_default, + item=item, + ) + ) + return variants + + def make_file_result( + self, + request, + *, + checksum: str | None, + status: str, + response=None, + item=None, + ) -> TranscodedImageFile: + path = self.file_path(request, item=item) + return { + "url": request.url, + "path": path, + "published_url": self.published_url(path, item), + "checksum": checksum, + "status": status, + "source_path": self.resolve_source_path(request, response), + "variants": self.load_variants_from_disk(request, item=item), + "thumbnails": [], + } + + def media_to_download(self, request, info, *, item=None): + canonical_path = self.file_path(request, info=info, item=item) + canonical_stat = cast( + dict[str, Any] | None, + self.store.stat_file(canonical_path, info), + ) + if not canonical_stat: + return None + last_modified = canonical_stat.get("last_modified") + if not last_modified: + return None + age_days = (time.time() - last_modified) / 60 / 60 / 24 + if age_days > self.expires: + return None + if not cast( + dict[str, Any] | None, + self.store.stat_file(self.resolve_source_path(request), info), + ): + return None + for _, _, path in self.variant_paths(request.url): + if not cast(dict[str, Any] | None, self.store.stat_file(path, info)): + return None + self.inc_stats("uptodate") + return self.make_file_result( + request, + checksum=canonical_stat.get("checksum"), + status="uptodate", + item=item, + ) + + def persist_variants(self, response, request, info, *, item=None) -> str | None: + source_file_path = self.local_store_path(self.source_path(request, response)) + source_buf = BytesIO(response.body) + source_image = load_image_from_buffer(response.body).autorot() + if source_image.width < self.min_width or source_image.height < self.min_height: raise ImageException( "Image too small " - f"({width}x{height} < {self.min_width}x{self.min_height})" + f"({source_image.width}x{source_image.height} < " + f"{self.min_width}x{self.min_height})" ) - checksum = buffer_checksum(buf) - self.store.persist_file( - path, - buf, - info, - meta={"width": width, "height": height}, - headers={"Content-Type": "image/jpeg"}, + if not cast( + dict[str, Any] | None, + self.store.stat_file(self.source_path(request, response), info), + ): + self.store.persist_file( + self.source_path(request, response), + source_buf, + info, + meta={"width": source_image.width, "height": source_image.height}, + headers={ + "Content-Type": image_loader_mimetype( + image_loader_name(source_image), + image_mimetype(response, url=request.url), + ) + or "application/octet-stream" + }, + ) + canonical_path = self.file_path( + request, response=response, info=info, item=item ) - return checksum + canonical_checksum = None + for _, setting, final_path in self.variant_paths(request.url): + stat = cast(dict[str, Any] | None, self.store.stat_file(final_path, info)) + if stat: + if final_path == canonical_path: + canonical_checksum = stat.get("checksum") + continue + out_buf = render_image_profile(source_file_path, setting) + width, height, file_size, _ = image_buffer_meta( + out_buf.getvalue(), + fallback_mimetype=setting["mimetype"], + ) + checksum = buffer_checksum(out_buf) + self.store.persist_file( + final_path, + out_buf, + info, + meta={"width": width, "height": height, "fileSize": file_size}, + headers={"Content-Type": setting["mimetype"]}, + ) + if final_path == canonical_path: + canonical_checksum = checksum + return canonical_checksum + + def media_downloaded(self, response, request, info, *, item=None): + if response.status != 200: + raise FileException("download-error") + if not response.body: + raise FileException("empty-content") + status = "cached" if "cached" in response.flags else "downloaded" + self.inc_stats(status) + checksum = self.persist_variants(response, request, info, item=item) + return self.make_file_result( + request, + checksum=checksum, + status=status, + response=response, + item=item, + ) + + +class ImageThumbnailPipeline: + @classmethod + def from_crawler(cls, crawler: Crawler): + return cls(crawler.settings["IMAGES_STORE"], crawler=crawler) + + def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler): + self.settings = crawler.settings + self.store_dir = Path(store_uri) + + def get_thumbnail_settings(self) -> list[dict[str, Any]]: + return list(self.settings["REPUBLISHER_IMAGE_THUMBNAILS"]) + + def local_store_path(self, path: str) -> Path: + return self.store_dir / path + + def published_url(self, path: str, item=None) -> str: + relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}" + feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/") + if feed_url == "" or item is None: + return relative_path + return f"{feed_url}/feeds/{item.feed_name}/{relative_path}" + + def persist_thumbnail( + self, source_file: Path, final_path: str, profile: dict[str, Any] + ): + out_buf = render_image_profile(source_file, profile) + target = self.local_store_path(final_path) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(out_buf.getvalue()) + + def load_thumbnail( + self, + *, + source_url: str, + profile: dict[str, Any], + item=None, + ) -> ThumbnailVariant | None: + final_path = repub.utils.thumbnail_image_path(source_url, profile) + file_path = self.local_store_path(final_path) + if not file_path.exists(): + return None + width, height, _, mimetype = image_variant_meta( + file_path, + fallback_mimetype=profile["mimetype"], + ) + return { + "url": self.published_url(final_path, item), + "path": final_path, + "slot": str(profile["name"]), + "type": mimetype or profile["mimetype"], + "width": width, + "height": height, + } + + def process_item(self, item, spider): + del spider + if not getattr(item, "images", None): + return item + for image in item.images: + source_path = image.get("source_path") + if not source_path: + image["thumbnails"] = [] + continue + source_file = self.local_store_path(source_path) + thumbnails: list[ThumbnailVariant] = [] + for profile in self.get_thumbnail_settings(): + final_path = repub.utils.thumbnail_image_path(image["url"], profile) + if not self.local_store_path(final_path).exists(): + try: + self.persist_thumbnail(source_file, final_path, profile) + except ImageException as exc: + logger.warning( + "Failed to generate thumbnail for %s: %s", image["url"], exc + ) + continue + thumbnail = self.load_thumbnail( + source_url=image["url"], + profile=profile, + item=item, + ) + if thumbnail is not None: + thumbnails.append(thumbnail) + image["thumbnails"] = thumbnails + return item + + +ImagePipeline = ImageNormalizePipeline class FilePipeline(BaseFilesPipeline): diff --git a/repub/rss.py b/repub/rss.py index b2274c0..4b0ba84 100644 --- a/repub/rss.py +++ b/repub/rss.py @@ -46,6 +46,7 @@ nsmap = { "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd", "dc": "http://purl.org/dc/elements/1.1/", "atom": "http://www.w3.org/2005/Atom", + "anynews": "https://guardianproject.info/rss/anynews/1.0", } CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"]) diff --git a/repub/settings.py b/repub/settings.py index 252c974..5b0cfcb 100644 --- a/repub/settings.py +++ b/repub/settings.py @@ -100,6 +100,116 @@ LOG_LEVEL = "INFO" MEDIA_ALLOW_REDIRECTS = True +REPUBLISHER_IMAGE_NORMALIZE_ENABLED = True +REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = True + +REPUBLISHER_IMAGE_DIR = "images" +REPUBLISHER_IMAGE_FULL_SUBDIR = "full" +REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source" +REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs" + +REPUBLISHER_IMAGE = [ + { + "name": "main_webp", + "mimetype": "image/webp", + "extension": "webp", + "transform": "thumbnail", + "transform_kwargs": { + "width": 1600, + "height": 1600, + "size": "down", + "no_rotate": False, + "linear": False, + "fail_on": "warning", + }, + "save": "webpsave_buffer", + "save_kwargs": { + "Q": 82, + "preset": "photo", + "smart_subsample": True, + "effort": 4, + "alpha_q": 90, + "keep": "none", + }, + }, + { + "name": "fallback_jpeg", + "mimetype": "image/jpeg", + "extension": "jpg", + "transform": "thumbnail", + "transform_kwargs": { + "width": 1600, + "height": 1600, + "size": "down", + "no_rotate": False, + "linear": False, + "fail_on": "warning", + }, + "save": "jpegsave_buffer", + "save_kwargs": { + "Q": 85, + "interlace": True, + "optimize_coding": True, + "trellis_quant": True, + "optimize_scans": True, + "subsample_mode": "auto", + "keep": "none", + "background": [255, 255, 255], + }, + }, +] + +REPUBLISHER_IMAGE_THUMBNAILS = [ + { + "name": "card_hero", + "mimetype": "image/jpeg", + "extension": "jpg", + "transform": "thumbnail", + "transform_kwargs": { + "width": 640, + "height": 360, + "size": "down", + "crop": "attention", + "no_rotate": False, + "linear": False, + "fail_on": "warning", + }, + "save": "jpegsave_buffer", + "save_kwargs": { + "Q": 82, + "interlace": True, + "optimize_coding": True, + "subsample_mode": "auto", + "keep": "none", + "background": [255, 255, 255], + }, + }, + { + "name": "list_square", + "mimetype": "image/jpeg", + "extension": "jpg", + "transform": "thumbnail", + "transform_kwargs": { + "width": 160, + "height": 160, + "size": "down", + "crop": "centre", + "no_rotate": False, + "linear": False, + "fail_on": "warning", + }, + "save": "jpegsave_buffer", + "save_kwargs": { + "Q": 78, + "interlace": True, + "optimize_coding": True, + "subsample_mode": "auto", + "keep": "none", + "background": [255, 255, 255], + }, + }, +] + REPUBLISHER_AUDIO = [ { "name": "mp3_vbr7_voice", diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index fa27317..5b11129 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -21,6 +21,7 @@ from repub.rss import ( ) from repub.utils import ( FileType, + canonical_published_image_path, canonical_published_media_path, determine_file_type, local_file_path, @@ -54,7 +55,16 @@ class BaseRssFeedSpider(Spider): local_path = local_file_path(url) if file_type == FileType.IMAGE: file_dir = self.settings["REPUBLISHER_IMAGE_DIR"] - local_path = local_image_path(url) + image_profiles = ( + self.settings.get("REPUBLISHER_IMAGE") or [] + if self.settings.getbool("REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True) + else [] + ) + local_path = ( + canonical_published_image_path(url, image_profiles) + if image_profiles + else local_image_path(url) + ) elif file_type == FileType.VIDEO: file_dir = self.settings["REPUBLISHER_VIDEO_DIR"] local_path = canonical_published_media_path( @@ -278,6 +288,7 @@ class RssFeedSpider(BaseRssFeedSpider): def parse_entry(self, response, feed, entry): image_urls = [] + media_image_urls = [] file_urls = [] audio_urls = [] video_urls = [] @@ -323,6 +334,7 @@ class RssFeedSpider(BaseRssFeedSpider): ) if entry.get("image"): image_urls.append(entry.get("image").href) + media_image_urls.append(entry.get("image").href) for enc in entry.enclosures: url = enc.get("href") file_type = determine_file_type(url=url, mimetype=enc.get("type")) @@ -381,6 +393,8 @@ class RssFeedSpider(BaseRssFeedSpider): ) ) add_url(file_type, media.get("url")) + if file_type == FileType.IMAGE: + media_image_urls.append(media.get("url")) return ElementItem( feed_name=self.feed_name, el=item, @@ -392,6 +406,7 @@ class RssFeedSpider(BaseRssFeedSpider): audios=[], video_urls=video_urls, videos=[], + media_image_urls=media_image_urls, ) WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)" diff --git a/repub/static/app.css b/repub/static/app.css index 94b02ed..9fa1cb3 100644 --- a/repub/static/app.css +++ b/repub/static/app.css @@ -419,6 +419,9 @@ .rotate-180 { rotate: 180deg; } + .transform { + transform: var(--tw-rotate-x,) var(--tw-rotate-y,) var(--tw-rotate-z,) var(--tw-skew-x,) var(--tw-skew-y,); + } .animate-pulse { animation: var(--animate-pulse); } @@ -1221,6 +1224,26 @@ inherits: false; initial-value: 0; } +@property --tw-rotate-x { + syntax: "*"; + inherits: false; +} +@property --tw-rotate-y { + syntax: "*"; + inherits: false; +} +@property --tw-rotate-z { + syntax: "*"; + inherits: false; +} +@property --tw-skew-x { + syntax: "*"; + inherits: false; +} +@property --tw-skew-y { + syntax: "*"; + inherits: false; +} @property --tw-space-y-reverse { syntax: "*"; inherits: false; @@ -1460,6 +1483,11 @@ --tw-translate-x: 0; --tw-translate-y: 0; --tw-translate-z: 0; + --tw-rotate-x: initial; + --tw-rotate-y: initial; + --tw-rotate-z: initial; + --tw-skew-x: initial; + --tw-skew-y: initial; --tw-space-y-reverse: 0; --tw-space-x-reverse: 0; --tw-divide-y-reverse: 0; diff --git a/repub/utils.py b/repub/utils.py index b8379a1..b443053 100644 --- a/repub/utils.py +++ b/repub/utils.py @@ -43,6 +43,50 @@ def local_audio_path(s: str) -> str: return local_file_path(s) +def image_guid(source_url: str) -> str: + return hashlib.sha1(to_bytes(source_url)).hexdigest() # nosec + + +def image_extension(mimetype_or_extension: str | None, source_url: str = "") -> str: + if mimetype_or_extension: + if mimetype_or_extension.startswith("."): + extension = mimetype_or_extension + elif "/" in mimetype_or_extension: + extension = mimetypes.guess_extension(mimetype_or_extension) or "" + else: + extension = f".{mimetype_or_extension.lstrip('.')}" + if extension == ".jpe": + return ".jpg" + return extension + guessed = Path(source_url).suffix + if guessed == ".jpe": + return ".jpg" + if guessed: + return guessed + return ".img" + + +def source_image_path(source_url: str, mimetype_or_extension: str | None = None) -> str: + extension = image_extension(mimetype_or_extension, source_url) + return f"source/{image_guid(source_url)}{extension}" + + +def published_image_path(source_url: str, profile: Mapping[str, Any]) -> str: + return variant_media_path(f"full/{image_guid(source_url)}", profile, hashed=True) + + +def canonical_published_image_path( + source_url: str, profiles: Sequence[Mapping[str, Any]] +) -> str: + if not profiles: + raise ValueError("Missing image normalization profiles") + return published_image_path(source_url, profiles[0]) + + +def thumbnail_image_path(source_url: str, profile: Mapping[str, Any]) -> str: + return variant_media_path(f"thumbs/{image_guid(source_url)}", profile, hashed=True) + + def profile_settings_hash(profile: Mapping[str, Any]) -> str: settings = { key: value @@ -65,6 +109,8 @@ def variant_media_path( def published_media_path( file_type: FileType, source_url: str, profile: Mapping[str, Any] ) -> str: + if file_type == FileType.IMAGE: + return published_image_path(source_url, profile) if file_type == FileType.AUDIO: return variant_media_path(local_audio_path(source_url), profile, hashed=True) if file_type == FileType.VIDEO: @@ -79,6 +125,8 @@ def canonical_published_media_path( raise ValueError(f"Missing transcode profiles for {file_type.value}") # The first configured profile is the public URL contract. Reordering profiles # changes published URLs for already-mirrored media. + if file_type == FileType.IMAGE: + return canonical_published_image_path(source_url, profiles) return published_media_path(file_type, source_url, profiles[0]) diff --git a/tests/test_config.py b/tests/test_config.py index cc59799..1d5816b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -224,7 +224,46 @@ def test_build_feed_settings_can_disable_image_and_video_conversion( convert_video=False, ) - assert "repub.pipelines.ImagePipeline" not in feed_settings["ITEM_PIPELINES"] + assert ( + "repub.pipelines.ImageNormalizePipeline" not in feed_settings["ITEM_PIPELINES"] + ) + assert ( + "repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"] + ) assert "repub.pipelines.VideoPipeline" not in feed_settings["ITEM_PIPELINES"] - assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 2 - assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 4 + assert feed_settings["REPUBLISHER_IMAGE_NORMALIZE_ENABLED"] is False + assert feed_settings["REPUBLISHER_IMAGE_THUMBNAILS_ENABLED"] is False + assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 3 + assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 5 + + +def test_build_feed_settings_respects_image_pipeline_feature_flags( + tmp_path: Path, +) -> None: + out_dir = (tmp_path / "mirror").resolve() + config = RepublisherConfig( + config_path=tmp_path / "repub.toml", + out_dir=out_dir, + feeds=( + FeedConfig( + name="Guardian Project Podcast", + slug="gp-pod", + url="https://guardianproject.info/podcast/podcast.xml", + ), + ), + scrapy_settings={"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": False}, + ) + + base_settings = build_base_settings(config) + feed_settings = build_feed_settings( + base_settings, + out_dir=out_dir, + feed_slug="gp-pod", + ) + + assert ( + feed_settings["ITEM_PIPELINES"]["repub.pipelines.ImageNormalizePipeline"] == 1 + ) + assert ( + "repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"] + ) diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py index 9e1f80b..f395770 100644 --- a/tests/test_feed_validation.py +++ b/tests/test_feed_validation.py @@ -16,10 +16,12 @@ from repub.rss import nsmap from repub.spiders.rss_spider import RssFeedSpider from repub.utils import ( FileType, + canonical_published_image_path, local_audio_path, - local_image_path, local_video_path, + published_image_path, published_media_path, + thumbnail_image_path, ) RSS_DATE_PATTERN = re.compile( @@ -44,6 +46,7 @@ def _serialize_feed( "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE, "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, "REPUBLISHER_FEED_URL": feed_url, @@ -75,6 +78,18 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: source_video = "https://source.example/media/video.mp4" channel_image = "https://source.example/media/channel.png" item_image = "https://source.example/media/cover.jpg" + image_main_path = published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ) + image_fallback_path = published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[1], + ) + image_thumbnail_path = thumbnail_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0], + ) audio_base_path = local_audio_path(source_audio) audio_default_path = published_media_path( FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0] @@ -94,6 +109,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: ) def prepare_item(item: ElementItem) -> None: + item.images = [ + { + "url": source_image, + "path": image_main_path, + "published_url": _published_url( + "https://mirror.example", + f"images/{image_main_path}", + ), + "checksum": "image-default", + "status": "downloaded", + "source_path": "source/ignored.png", + "variants": [ + { + "url": _published_url( + "https://mirror.example", + f"images/{image_main_path}", + ), + "path": image_main_path, + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "fileSize": "2345", + "width": "1200", + "height": "675", + }, + { + "url": _published_url( + "https://mirror.example", + f"images/{image_fallback_path}", + ), + "path": image_fallback_path, + "type": "image/jpeg", + "medium": "image", + "isDefault": "false", + "fileSize": "3456", + "width": "1200", + "height": "675", + }, + ], + "thumbnails": [ + { + "url": _published_url( + "https://mirror.example", + f"images/{image_thumbnail_path}", + ), + "path": image_thumbnail_path, + "slot": "card_hero", + "type": "image/jpeg", + "width": "640", + "height": "360", + } + ], + } + ] item.audios = [ { "url": source_audio, @@ -261,6 +330,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: Tue, 31 Mar 2026 10:31:50 +0000 ]]> + @@ -288,7 +358,11 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: assert last_build_date == item_pub_date assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false" assert channel.findtext("./image/url") == ( - f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}" + "https://mirror.example/feeds/demo/images/" + + canonical_published_image_path( + channel_image, + repub_settings.REPUBLISHER_IMAGE, + ) ) atom_self = channel.find("atom:link", namespaces=nsmap) @@ -318,9 +392,63 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: assert root.find("./channel/item/media:content", namespaces=nsmap) is None media_groups = root.findall("./channel/item/media:group", namespaces=nsmap) - assert len(media_groups) == 2 + assert len(media_groups) == 3 + + image_group = next( + group + for group in media_groups + if group.find("media:thumbnail", namespaces=nsmap) is not None + ) + audio_group = next( + group + for group in media_groups + if group.findall("media:content", namespaces=nsmap) + and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "audio" + ) + video_group = next( + group + for group in media_groups + if group.findall("media:content", namespaces=nsmap) + and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "video" + ) + + image_variants = image_group.findall("media:content", namespaces=nsmap) + assert [variant.attrib for variant in image_variants] == [ + { + "url": (f"https://mirror.example/feeds/demo/images/" f"{image_main_path}"), + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "expression": "full", + "lang": "en", + "height": "675", + "width": "1200", + "fileSize": "2345", + }, + { + "url": ( + f"https://mirror.example/feeds/demo/images/" f"{image_fallback_path}" + ), + "type": "image/jpeg", + "medium": "image", + "isDefault": "false", + "expression": "full", + "lang": "en", + "height": "675", + "width": "1200", + "fileSize": "3456", + }, + ] + thumbnails = image_group.findall("media:thumbnail", namespaces=nsmap) + assert len(thumbnails) == 1 + assert thumbnails[0].attrib == { + "url": (f"https://mirror.example/feeds/demo/images/" f"{image_thumbnail_path}"), + "width": "640", + "height": "360", + f"{{{nsmap['anynews']}}}slot": "card_hero", + f"{{{nsmap['anynews']}}}type": "image/jpeg", + } - audio_group, video_group = media_groups audio_variants = audio_group.findall("media:content", namespaces=nsmap) assert [variant.attrib for variant in audio_variants] == [ { @@ -428,7 +556,13 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap) assert itunes_image is not None assert itunes_image.attrib == { - "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}" + "href": ( + "https://mirror.example/feeds/demo/images/" + + canonical_published_image_path( + item_image, + repub_settings.REPUBLISHER_IMAGE, + ) + ) } itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap) @@ -494,3 +628,165 @@ def test_item_body_uses_description_only_when_content_is_also_present() -> None: assert both_present.findtext("content:encoded", namespaces=nsmap) == ( "
Full body
" ) + + +def test_exporter_does_not_emit_media_rss_for_inline_only_images() -> None: + source_image = "https://source.example/media/inline.jpg" + + def prepare_item(item: ElementItem) -> None: + item.images = [ + { + "url": source_image, + "path": published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ), + "published_url": _published_url( + "https://mirror.example", + "images/" + + published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ), + ), + "checksum": "inline-image", + "status": "downloaded", + "source_path": "source/inline.jpg", + "variants": [ + { + "url": _published_url( + "https://mirror.example", + "images/" + + published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ), + ), + "path": published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ), + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "width": "1200", + "height": "675", + "fileSize": "2345", + } + ], + "thumbnails": [], + } + ] + + _, root = _serialize_feed( + feed_url="https://mirror.example", + prepare_item=prepare_item, + feed_text=f""" + + + Demo Feed + https://source.example/feed + Demo description + + Inline Image Only + https://source.example/inline + inline-only + Tue, 31 Mar 2026 10:31:50 +0000 + ]]> + + + +""", + ) + + assert root.findall("./channel/item/media:group", namespaces=nsmap) == [] + + +def test_exporter_replaces_standalone_source_media_thumbnails() -> None: + source_image = "https://source.example/media/photo.jpg" + image_main_path = published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ) + image_thumbnail_path = thumbnail_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0], + ) + + def prepare_item(item: ElementItem) -> None: + item.images = [ + { + "url": source_image, + "path": image_main_path, + "published_url": _published_url( + "https://mirror.example", + f"images/{image_main_path}", + ), + "checksum": "image-default", + "status": "downloaded", + "source_path": "source/ignored.png", + "variants": [ + { + "url": _published_url( + "https://mirror.example", + f"images/{image_main_path}", + ), + "path": image_main_path, + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "fileSize": "2345", + "width": "1200", + "height": "675", + } + ], + "thumbnails": [ + { + "url": _published_url( + "https://mirror.example", + f"images/{image_thumbnail_path}", + ), + "path": image_thumbnail_path, + "slot": "card_hero", + "type": "image/jpeg", + "width": "640", + "height": "360", + } + ], + } + ] + + _, root = _serialize_feed( + feed_url="https://mirror.example", + prepare_item=prepare_item, + feed_text=f""" + + + Demo Feed + https://source.example/feed + Demo description + + Entry One + https://source.example/entry-1 + entry-1 + Tue, 31 Mar 2026 10:31:50 +0000 + + + + + +""", + ) + + thumbnails = root.findall("./channel/item/media:thumbnail", namespaces=nsmap) + assert thumbnails == [] + group_thumbnails = root.findall( + "./channel/item/media:group/media:thumbnail", + namespaces=nsmap, + ) + assert len(group_thumbnails) == 1 + assert group_thumbnails[0].get("url") == ( + f"https://mirror.example/feeds/demo/images/{image_thumbnail_path}" + ) diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py index ff43b6a..27d198e 100644 --- a/tests/test_file_feeds.py +++ b/tests/test_file_feeds.py @@ -8,10 +8,13 @@ from repub import settings as repub_settings from repub.spiders.rss_spider import RssFeedSpider from repub.utils import ( FileType, + canonical_published_image_path, local_audio_path, local_image_path, local_video_path, + published_image_path, published_media_path, + thumbnail_image_path, ) @@ -57,14 +60,17 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE, "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, } ) - assert ( - spider.rewrite_image_url("https://example.com/media/photo.jpg") - == f"images/{local_image_path('https://example.com/media/photo.jpg')}" + assert spider.rewrite_image_url( + "https://example.com/media/photo.jpg" + ) == "images/" + canonical_published_image_path( + "https://example.com/media/photo.jpg", + repub_settings.REPUBLISHER_IMAGE, ) assert spider.rewrite_file_url( FileType.AUDIO, @@ -90,6 +96,28 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: ) +def test_rss_spider_keeps_legacy_image_paths_when_image_normalization_disabled() -> ( + None +): + spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss") + spider.settings = Settings( + values={ + "REPUBLISHER_IMAGE_DIR": "images", + "REPUBLISHER_FILE_DIR": "files", + "REPUBLISHER_AUDIO_DIR": "audio", + "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_IMAGE_NORMALIZE_ENABLED": False, + "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE, + "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, + "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, + } + ) + + assert spider.rewrite_image_url("https://example.com/media/photo.jpg") == ( + f"images/{local_image_path('https://example.com/media/photo.jpg')}" + ) + + def test_published_media_path_changes_when_profile_args_change() -> None: source_url = "https://example.com/media/clip.mp4" audio_profile = repub_settings.REPUBLISHER_AUDIO[0] @@ -113,6 +141,41 @@ def test_published_media_path_changes_when_profile_args_change() -> None: ) != published_media_path(FileType.VIDEO, source_url, base_profile) +def test_published_image_and_thumbnail_paths_change_when_profile_args_change() -> None: + source_url = "https://example.com/media/photo.png" + base_image_profile = repub_settings.REPUBLISHER_IMAGE[0] + base_thumbnail_profile = repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0] + + assert canonical_published_image_path( + source_url, + repub_settings.REPUBLISHER_IMAGE, + ) == published_image_path(source_url, base_image_profile) + + changed_image_profile = { + **base_image_profile, + "transform_kwargs": { + **base_image_profile["transform_kwargs"], + "width": 2048, + }, + } + assert published_image_path( + source_url, + changed_image_profile, + ) != published_image_path(source_url, base_image_profile) + + changed_thumbnail_profile = { + **base_thumbnail_profile, + "save_kwargs": { + **base_thumbnail_profile["save_kwargs"], + "Q": 60, + }, + } + assert thumbnail_image_path( + source_url, + changed_thumbnail_profile, + ) != thumbnail_image_path(source_url, base_thumbnail_profile) + + def test_rss_spider_keeps_items_with_empty_content_encoded() -> None: feed_text = """ @@ -138,6 +201,7 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None: "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE, "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, } diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 0c1ec6b..821a8c6 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -20,17 +20,20 @@ from repub.items import ElementItem from repub.pipelines import ( AudioPipeline, FilePipeline, - ImagePipeline, + ImageNormalizePipeline, + ImageThumbnailPipeline, VideoPipeline, - convert_image_body_to_jpeg, image_mimetype, ) from repub.utils import ( FileType, + canonical_published_image_path, local_audio_path, - local_image_path, local_video_path, + published_image_path, published_media_path, + source_image_path, + thumbnail_image_path, ) @@ -54,8 +57,15 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace: return SimpleNamespace(settings=settings, request_fingerprinter=object()) +class HashableSpiderInfo: + __hash__ = object.__hash__ + + def __init__(self) -> None: + self.spider = SimpleNamespace() + + def spider_info() -> Any: - return SimpleNamespace(spider=SimpleNamespace()) + return HashableSpiderInfo() def store_dir(pipeline: Any) -> Path: @@ -66,13 +76,14 @@ def transparent_png_bytes() -> bytes: return cast(Any, pyvips.Image.black(2, 3, bands=4)).pngsave_buffer() -def jpeg_bytes() -> bytes: - return cast(Any, pyvips.Image.black(4, 5, bands=3)).jpegsave_buffer(Q=90) +def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes: + return cast(Any, pyvips.Image.black(width, height, bands=bands)).pngsave_buffer() @pytest.mark.parametrize( ("pipeline_cls", "store_setting"), [ + (ImageNormalizePipeline, "IMAGES_STORE"), (AudioPipeline, "AUDIO_STORE"), (VideoPipeline, "VIDEO_STORE"), (FilePipeline, "FILES_STORE"), @@ -647,39 +658,16 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant assert completed_item.audios == [result] -def test_convert_image_body_to_jpeg_flattens_alpha_png() -> None: - converted, width, height = convert_image_body_to_jpeg(transparent_png_bytes()) - - assert (width, height) == (2, 3) - assert converted.getvalue().startswith(b"\xff\xd8\xff") - - image = cast(Any, pyvips.Image.new_from_buffer(converted.getvalue(), "")) - assert image.width == 2 - assert image.height == 3 - assert image.bands == 3 - assert min(image.getpoint(0, 0)) >= 240 - - -def test_convert_image_body_to_jpeg_passthroughs_jpeg_bytes() -> None: - source = jpeg_bytes() - - converted, width, height = convert_image_body_to_jpeg(source) - - assert (width, height) == (4, 5) - assert converted.getvalue() == source - - def test_image_mimetype_does_not_guess_from_url_extension() -> None: assert image_mimetype(url="https://example.com/photo.jpg") is None -def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images( +def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants( monkeypatch, tmp_path: Path ) -> None: crawler = build_test_crawler(tmp_path) - pipeline = ImagePipeline.from_crawler(cast(Crawler, crawler)) + pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler)) monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) - persisted: list[tuple[str, bytes, dict[str, Any] | None, str | None]] = [] source_url = "https://example.com/photo.png" item = ElementItem( feed_name="nasa", @@ -693,21 +681,179 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images video_urls=[], videos=[], ) - - def fake_persist_file(path, buf, info, meta=None, headers=None): - del info - persisted.append( - ( - path, - buf.getvalue(), - cast(dict[str, Any] | None, meta), - None if headers is None else headers.get("Content-Type"), - ) - ) - - monkeypatch.setattr(pipeline.store, "persist_file", fake_persist_file) + canonical_path = canonical_published_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE"], + ) + source_path = source_image_path(source_url, "image/png") + webp_path = published_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE"][0], + ) + jpeg_path = published_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE"][1], + ) + source_body = transparent_png_bytes() result = pipeline.media_downloaded( + Response( + url=source_url, + body=source_body, + status=200, + headers={"Content-Type": "image/png"}, + ), + Request(source_url), + spider_info(), + item=item, + ) + webp_file_size = result["variants"][0].get("fileSize") + jpeg_file_size = result["variants"][1].get("fileSize") + + assert result == { + "url": source_url, + "path": canonical_path, + "published_url": f"https://mirror.example/feeds/nasa/images/{canonical_path}", + "checksum": result["checksum"], + "status": "downloaded", + "source_path": source_path, + "variants": [ + { + "url": f"https://mirror.example/feeds/nasa/images/{webp_path}", + "path": webp_path, + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "fileSize": webp_file_size, + "width": 2, + "height": 3, + }, + { + "url": f"https://mirror.example/feeds/nasa/images/{jpeg_path}", + "path": jpeg_path, + "type": "image/jpeg", + "medium": "image", + "isDefault": "false", + "fileSize": jpeg_file_size, + "width": 2, + "height": 3, + }, + ], + "thumbnails": [], + } + assert isinstance(result["checksum"], str) + assert isinstance(webp_file_size, int) + assert isinstance(jpeg_file_size, int) + assert (store_dir(pipeline) / source_path).read_bytes() == source_body + webp_image = cast( + Any, + pyvips.Image.new_from_file(str(store_dir(pipeline) / webp_path)), + ) + jpeg_image = cast( + Any, + pyvips.Image.new_from_file(str(store_dir(pipeline) / jpeg_path)), + ) + assert (webp_image.width, webp_image.height) == (2, 3) + assert (jpeg_image.width, jpeg_image.height) == (2, 3) + assert jpeg_image.bands == 3 + + completed_item = pipeline.item_completed([(True, result)], item, spider_info()) + assert completed_item.images == [result] + + +def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image( + monkeypatch, tmp_path: Path +) -> None: + crawler = build_test_crawler(tmp_path) + normalize_pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler)) + thumbnail_pipeline = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler)) + monkeypatch.setattr(normalize_pipeline, "inc_stats", lambda status: None) + source_url = "https://example.com/photo.png" + source_body = png_bytes(1200, 900) + item = ElementItem( + feed_name="nasa", + el=None, + image_urls=[source_url], + images=[], + file_urls=[], + files=[], + audio_urls=[], + audios=[], + video_urls=[], + videos=[], + ) + + normalized = normalize_pipeline.media_downloaded( + Response( + url=source_url, + body=source_body, + status=200, + headers={"Content-Type": "image/png"}, + ), + Request(source_url), + spider_info(), + item=item, + ) + item.images = [normalized] + + processed = thumbnail_pipeline.process_item(item, spider_info().spider) + thumbnails = processed.images[0]["thumbnails"] + thumb_slots = [thumb.get("slot") for thumb in thumbnails] + first_thumb = thumbnails[0] + second_thumb = thumbnails[1] + + assert processed.images[0]["path"] == canonical_published_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE"], + ) + assert thumb_slots == ["card_hero", "list_square"] + assert first_thumb.get("path") == thumbnail_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][0], + ) + assert first_thumb.get("type") == "image/jpeg" + assert first_thumb.get("width") == 640 + assert first_thumb.get("height") == 360 + assert second_thumb.get("path") == thumbnail_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][1], + ) + assert second_thumb.get("width") == 160 + assert second_thumb.get("height") == 160 + for thumb in thumbnails: + thumb_path = thumb.get("path") + thumb_width = thumb.get("width") + thumb_height = thumb.get("height") + thumb_image = cast( + Any, + pyvips.Image.new_from_file( + str(store_dir(normalize_pipeline) / str(thumb_path)) + ), + ) + assert (thumb_image.width, thumb_image.height) == (thumb_width, thumb_height) + + +def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls( + monkeypatch, tmp_path: Path +) -> None: + crawler = build_test_crawler(tmp_path) + pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler)) + monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) + source_url = "https://example.com/photo" + item = ElementItem( + feed_name="nasa", + el=None, + image_urls=[source_url], + images=[], + file_urls=[], + files=[], + audio_urls=[], + audios=[], + video_urls=[], + videos=[], + ) + + downloaded = pipeline.media_downloaded( Response( url=source_url, body=transparent_png_bytes(), @@ -719,25 +865,11 @@ def test_image_pipeline_media_downloaded_persists_converted_jpeg_and_sets_images item=item, ) - assert result == { - "url": source_url, - "path": local_image_path(source_url), - "checksum": result["checksum"], - "status": "downloaded", - } - assert isinstance(result["checksum"], str) - assert len(persisted) == 1 - assert persisted[0][0] == local_image_path(source_url) - assert persisted[0][2] == {"width": 2, "height": 3} - assert persisted[0][3] == "image/jpeg" + uptodate = pipeline.media_to_download(Request(source_url), spider_info(), item=item) - image = cast(Any, pyvips.Image.new_from_buffer(persisted[0][1], "")) - assert image.width == 2 - assert image.height == 3 - assert image.bands == 3 - - completed_item = pipeline.item_completed([(True, result)], item, spider_info()) - assert completed_item.images == [result] + assert downloaded["source_path"].endswith(".png") + assert uptodate is not None + assert uptodate["source_path"] == downloaded["source_path"] def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants(