diff --git a/flake.lock b/flake.lock index 86b48f0..f7a5277 100644 --- a/flake.lock +++ b/flake.lock @@ -2,16 +2,18 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1774386573, - "narHash": "sha256-4hAV26quOxdC6iyG7kYaZcM3VOskcPUrdCQd/nx8obc=", - "rev": "46db2e09e1d3f113a13c0d7b81e2f221c63b8ce9", - "revCount": 969196, - "type": "tarball", - "url": "https://api.flakehub.com/f/pinned/NixOS/nixpkgs/0.1.969196%2Brev-46db2e09e1d3f113a13c0d7b81e2f221c63b8ce9/019d279e-af65-79ce-92be-5dee7b1e36d4/source.tar.gz" + "lastModified": 1779622335, + "narHash": "sha256-ViA62qtL5za7V3d5I8OA9q9JcFhsVAiL5jVHwEclWqk=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "705e9929918b43bd7b715dc0a878ac870449bb03", + "type": "github" }, "original": { - "type": "tarball", - "url": "https://flakehub.com/f/NixOS/nixpkgs/0.1" + "owner": "nixos", + "ref": "nixos-26.05", + "repo": "nixpkgs", + "type": "github" } }, "pyproject-build-systems": { @@ -27,11 +29,11 @@ ] }, "locked": { - "lastModified": 1773870109, - "narHash": "sha256-ZoTdqZP03DcdoyxvpFHCAek4bkPUTUPUF3oCCgc3dP4=", + "lastModified": 1779676664, + "narHash": "sha256-MbXylBTkWqVm8/VYjoULtMoVRgWBN1gSHbeRKsOsPlU=", "owner": "pyproject-nix", "repo": "build-system-pkgs", - "rev": "b6e74f433b02fa4b8a7965ee24680f4867e2926f", + "rev": "7bff980f37fc24e09dbc986643719900c139bf12", "type": "github" }, "original": { @@ -47,11 +49,11 @@ ] }, "locked": { - "lastModified": 1774498001, - "narHash": "sha256-wTfdyzzrmpuqt4TQQNqilF91v0m5Mh1stNy9h7a/WK4=", + "lastModified": 1778901413, + "narHash": "sha256-GSKXTAnFqRAMlZkJrIPcQMYf+lpMr66K3i60mB9STvc=", "owner": "pyproject-nix", "repo": "pyproject.nix", - "rev": "794afa6eb588b498344f2eaa36ab1ceb7e6b0b09", + "rev": "a228447c3e179d477c1b6246ef3efa8cfe3c469a", "type": "github" }, "original": { @@ -76,11 +78,11 @@ ] }, "locked": { - "lastModified": 1773297127, - "narHash": "sha256-6E/yhXP7Oy/NbXtf1ktzmU8SdVqJQ09HC/48ebEGBpk=", + "lastModified": 1775636079, + "narHash": "sha256-pc20NRoMdiar8oPQceQT47UUZMBTiMdUuWrYu2obUP0=", "owner": "numtide", "repo": "treefmt-nix", - "rev": "71b125cd05fbfd78cab3e070b73544abe24c5016", + "rev": "790751ff7fd3801feeaf96d7dc416a8d581265ba", "type": "github" }, "original": { @@ -99,11 +101,11 @@ ] }, "locked": { - "lastModified": 1774705889, - "narHash": "sha256-TRTIM18gP3ccBj3m8bV1zx82xeYweNYp8/lgcdR4Zz0=", + "lastModified": 1779411315, + "narHash": "sha256-IMFlxeyClau51KplhhSRGhdGTvD/knShHdybP1UOTuk=", "owner": "pyproject-nix", "repo": "uv2nix", - "rev": "28355ed75b466a15ff324e1baa151b550619fe67", + "rev": "fdf2a76275d7a9c27deb5d2f2ab33526ac9052ff", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 2d4cda9..c574d90 100644 --- a/flake.nix +++ b/flake.nix @@ -2,7 +2,7 @@ description = "republisher-redux - offline RSS and Atom feed mirroring"; inputs = { - nixpkgs.url = "https://flakehub.com/f/NixOS/nixpkgs/0.1"; + nixpkgs.url = "github:nixos/nixpkgs/nixos-26.05"; treefmt-nix = { url = "github:numtide/treefmt-nix"; inputs.nixpkgs.follows = "nixpkgs"; @@ -63,6 +63,12 @@ feedgen = prev.feedgen.overrideAttrs (old: { nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ final.setuptools ]; }); + pyvips = prev.pyvips.overrideAttrs (old: { + nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ + final.setuptools + final.pkgconfig + ]; + }); pygea = prev.pygea.overrideAttrs (old: { nativeBuildInputs = (old.nativeBuildInputs or [ ]) ++ [ final.hatchling @@ -108,6 +114,7 @@ checkPhase = '' runHook preCheck export HOME="$(mktemp -d)" + export LD_LIBRARY_PATH="${pkgs.lib.makeLibraryPath [ pkgs.vips ]}:$LD_LIBRARY_PATH" pytest tests/ -v runHook postCheck ''; @@ -125,7 +132,8 @@ postBuild = '' rm -f "$out/bin/repub" makeWrapper "${baseVenv}/bin/repub" "$out/bin/repub" \ - --prefix PATH : "${pkgs.lib.makeBinPath [ ffmpegPackage ]}" + --prefix PATH : "${pkgs.lib.makeBinPath [ ffmpegPackage ]}" \ + --prefix LD_LIBRARY_PATH : "${pkgs.lib.makeLibraryPath [ pkgs.vips ]}" ''; meta.mainProgram = "repub"; }; @@ -273,12 +281,14 @@ packages = [ pkgs.tailwindcss_4 pkgs.python313 + pkgs.vips pkgs.uv pkgs.pyright (mkFfmpegPackage pkgs) ]; env.LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [ pkgs.stdenv.cc.cc + pkgs.vips ]; env.UV_PROJECT_ENVIRONMENT = ".venv"; env.UV_PYTHON_DOWNLOADS = "never"; diff --git a/pyproject.toml b/pyproject.toml index b87027b..baddc3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ dependencies = [ "colorlog>=6.8.2,<7.0.0", "feedparser>=6.0.11,<7.0.0", "lxml>=5.2.1,<6.0.0", - "pillow>=10.3.0,<11.0.0", + "pyvips>=3.0.0,<4.0.0", "ffmpeg-python>=0.2.0,<0.3.0", "Quart>=0.20.0,<0.21.0", "hypercorn>=0.18.0,<0.19.0", diff --git a/repub/config.py b/repub/config.py index e9e86b3..d17c7d7 100644 --- a/repub/config.py +++ b/repub/config.py @@ -188,21 +188,31 @@ def build_feed_settings( video_dir = base_settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR) audio_dir = base_settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR) file_dir = base_settings.get("REPUBLISHER_FILE_DIR", FILE_DIR) + image_normalize_enabled = convert_images and base_settings.getbool( + "REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True + ) + image_thumbnails_enabled = image_normalize_enabled and base_settings.getbool( + "REPUBLISHER_IMAGE_THUMBNAILS_ENABLED", True + ) item_pipelines = dict(base_settings.getdict("ITEM_PIPELINES")) item_pipelines.pop("repub.pipelines.ImagePipeline", None) + item_pipelines.pop("repub.pipelines.ImageNormalizePipeline", None) + item_pipelines.pop("repub.pipelines.ImageThumbnailPipeline", None) item_pipelines.pop("repub.pipelines.AudioPipeline", None) item_pipelines.pop("repub.pipelines.VideoPipeline", None) item_pipelines.pop("repub.pipelines.FilePipeline", None) item_pipelines.update( { - "repub.pipelines.AudioPipeline": 2, - "repub.pipelines.FilePipeline": 4, + "repub.pipelines.AudioPipeline": 3, + "repub.pipelines.FilePipeline": 5, } ) - if convert_images: - item_pipelines["repub.pipelines.ImagePipeline"] = 1 + if image_normalize_enabled: + item_pipelines["repub.pipelines.ImageNormalizePipeline"] = 1 + if image_thumbnails_enabled: + item_pipelines["repub.pipelines.ImageThumbnailPipeline"] = 2 if convert_video: - item_pipelines["repub.pipelines.VideoPipeline"] = 3 + item_pipelines["repub.pipelines.VideoPipeline"] = 4 settings = base_settings.copy() settings.setdict( { @@ -219,6 +229,8 @@ def build_feed_settings( "LOG_FILE": str(out_dir / "logs" / f"{feed_slug}.log"), "HTTPCACHE_DIR": str(out_dir / "httpcache"), "REPUBLISHER_IMAGE_DIR": image_dir, + "REPUBLISHER_IMAGE_NORMALIZE_ENABLED": image_normalize_enabled, + "REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": image_thumbnails_enabled, "REPUBLISHER_VIDEO_DIR": video_dir, "REPUBLISHER_AUDIO_DIR": audio_dir, "REPUBLISHER_FILE_DIR": file_dir, diff --git a/repub/exporters.py b/repub/exporters.py index 99b0663..ab954c9 100644 --- a/repub/exporters.py +++ b/repub/exporters.py @@ -9,12 +9,17 @@ from repub.items import ( ChannelElementItem, ElementItem, MediaVariant, + ThumbnailVariant, + TranscodedImageFile, TranscodedMediaFile, ) from repub.utils import FileType, determine_file_type MEDIA_CONTENT_TAG = QName(rss.nsmap["media"], "content").text MEDIA_GROUP_TAG = QName(rss.nsmap["media"], "group").text +MEDIA_THUMBNAIL_TAG = QName(rss.nsmap["media"], "thumbnail").text +ANYNEWS_SLOT_ATTR = QName(rss.nsmap["anynews"], "slot").text +ANYNEWS_TYPE_ATTR = QName(rss.nsmap["anynews"], "type").text class RssExporter(BaseItemExporter): @@ -52,7 +57,9 @@ class RssExporter(BaseItemExporter): key: str(value) for key, value in attrib.items() if value not in (None, "") } - def canonical_variant(self, media_file: TranscodedMediaFile) -> MediaVariant | None: + def canonical_variant( + self, media_file: TranscodedMediaFile | TranscodedImageFile + ) -> MediaVariant | None: for variant in media_file["variants"]: if variant.get("isDefault") == "true": return variant @@ -92,6 +99,8 @@ class RssExporter(BaseItemExporter): def strip_managed_media_nodes(self, item: ElementItem) -> dict[str, dict[str, str]]: fallbacks: dict[str, dict[str, str]] = {} managed_types: set[FileType] = set() + if self.managed_image_files(item): + managed_types.add(FileType.IMAGE) if item.audios: managed_types.add(FileType.AUDIO) if item.videos: @@ -100,6 +109,9 @@ class RssExporter(BaseItemExporter): return fallbacks for child in list(item.el): + if child.tag == MEDIA_THUMBNAIL_TAG and FileType.IMAGE in managed_types: + item.el.remove(child) + continue if child.tag == MEDIA_CONTENT_TAG: if self.owned_media_type(child, managed_types) is None: continue @@ -113,25 +125,43 @@ class RssExporter(BaseItemExporter): if child.tag != MEDIA_GROUP_TAG: continue + managed_image_group = False for media_content in list(child): if media_content.tag != MEDIA_CONTENT_TAG: continue - if self.owned_media_type(media_content, managed_types) is None: + owned_type = self.owned_media_type(media_content, managed_types) + if owned_type is None: continue + if owned_type == FileType.IMAGE: + managed_image_group = True fallbacks[media_content.get("url", "")] = { key: value for key, value in media_content.attrib.items() if key in {"expression", "lang"} } child.remove(media_content) + if managed_image_group: + for media_thumbnail in list(child): + if media_thumbnail.tag == MEDIA_THUMBNAIL_TAG: + child.remove(media_thumbnail) if len(child) == 0: item.el.remove(child) return fallbacks + def managed_image_files(self, item: ElementItem) -> list[TranscodedImageFile]: + media_image_urls = set(item.media_image_urls) + if not media_image_urls: + return [] + return [image for image in item.images if image["url"] in media_image_urls] + def append_media_groups( self, item: ElementItem, fallbacks: dict[str, dict[str, str]] ): - for media_file in [*item.audios, *item.videos]: + for media_file in [ + *self.managed_image_files(item), + *item.audios, + *item.videos, + ]: if not media_file["variants"]: continue fallback_attrib = fallbacks.get(media_file["published_url"], {}) @@ -141,7 +171,11 @@ class RssExporter(BaseItemExporter): **self.media_content_attrib(variant, fallback_attrib) ) for variant in media_file["variants"] - ] + ], + *[ + rss.MEDIA.thumbnail(**self.media_thumbnail_attrib(thumbnail)) + for thumbnail in media_file.get("thumbnails", []) + ], ) if group is not None: item.el.append(group) @@ -170,10 +204,22 @@ class RssExporter(BaseItemExporter): ) return attrib + def media_thumbnail_attrib(self, thumbnail: ThumbnailVariant) -> dict[str, str]: + attrib = self.compact_attrib( + url=thumbnail.get("url"), + width=thumbnail.get("width"), + height=thumbnail.get("height"), + ) + if thumbnail.get("slot"): + attrib[ANYNEWS_SLOT_ATTR] = str(thumbnail["slot"]) + if thumbnail.get("type"): + attrib[ANYNEWS_TYPE_ATTR] = str(thumbnail["type"]) + return attrib + def apply_transcoded_media(self, item: Any) -> None: if not isinstance(item, ElementItem): return - if not item.audios and not item.videos: + if not self.managed_image_files(item) and not item.audios and not item.videos: return self.rebuild_enclosures(item) fallbacks = self.strip_managed_media_nodes(item) diff --git a/repub/items.py b/repub/items.py index d5e77be..310da3f 100644 --- a/repub/items.py +++ b/repub/items.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, List, TypedDict @@ -8,7 +8,7 @@ class MediaVariant(TypedDict, total=False): type: str medium: str isDefault: str - fileSize: str + fileSize: int | str bitrate: int | float | str samplingrate: int | str channels: int | str @@ -29,18 +29,39 @@ class TranscodedMediaFile(TypedDict): variants: List[MediaVariant] +class ThumbnailVariant(TypedDict, total=False): + url: str + path: str + width: int | str + height: int | str + slot: str + type: str + + +class TranscodedImageFile(TypedDict): + url: str + path: str + checksum: str | None + status: str + published_url: str + source_path: str + variants: List[MediaVariant] + thumbnails: List[ThumbnailVariant] + + @dataclass class ElementItem: feed_name: str el: Any image_urls: List[str] - images: List[Any] + images: List[TranscodedImageFile] file_urls: List[str] files: List[Any] audio_urls: List[str] audios: List[TranscodedMediaFile] video_urls: List[str] videos: List[TranscodedMediaFile] + media_image_urls: List[str] = field(default_factory=list) @dataclass @@ -48,4 +69,5 @@ class ChannelElementItem: feed_name: str el: Any image_urls: List[str] - images: List[Any] + images: List[TranscodedImageFile] + media_image_urls: List[str] = field(default_factory=list) diff --git a/repub/pipelines.py b/repub/pipelines.py index c2b11e3..69a6c73 100644 --- a/repub/pipelines.py +++ b/repub/pipelines.py @@ -1,3 +1,4 @@ +import functools import hashlib import logging import mimetypes @@ -8,24 +9,482 @@ from os import PathLike from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast +import pyvips from scrapy.crawler import Crawler from scrapy.pipelines.files import FileException from scrapy.pipelines.files import FilesPipeline as BaseFilesPipeline -from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline import repub.utils from repub import media -from repub.items import MediaVariant, TranscodedMediaFile +from repub.items import ( + MediaVariant, + ThumbnailVariant, + TranscodedImageFile, + TranscodedMediaFile, +) logger = logging.getLogger(__name__) -class ImagePipeline(BaseImagesPipeline): - def file_path(self, request, response=None, info=None, *, item=None): - return repub.utils.local_image_path(request.url) +class ImageException(FileException): + """General image error exception""" - def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None): - raise NotImplementedError() + +def image_mimetype(response=None, *, url: str | None = None) -> str | None: + del url + if response is not None: + content_type = response.headers.get(b"Content-Type") + if content_type: + return content_type.decode("utf-8").split(";", 1)[0].strip() + return None + + +def image_loader_name(image: Any) -> str: + if image.get_typeof("vips-loader"): + return str(image.get("vips-loader")) + return "" + + +def image_loader_mimetype(loader: str, fallback: str | None = None) -> str | None: + known = { + "jpegload": "image/jpeg", + "pngload": "image/png", + "gifload": "image/gif", + "svgload": "image/svg+xml", + "tiffload": "image/tiff", + "webpload": "image/webp", + "heifload": "image/heif", + "jxlload": "image/jxl", + } + for prefix, mimetype in known.items(): + if loader.startswith(prefix): + return mimetype + return fallback + + +def load_image_from_buffer(body: bytes) -> Any: + try: + return cast( + Any, + pyvips.Image.new_from_buffer(body, "", access="sequential"), + ) + except pyvips.Error as exc: + raise ImageException(str(exc)) from exc + + +def load_image_from_file(file_path: str | Path) -> Any: + try: + return cast( + Any, + pyvips.Image.new_from_file(str(file_path), access="sequential"), + ) + except pyvips.Error as exc: + raise ImageException(str(exc)) from exc + + +def render_image_profile(source_path: str | Path, profile: dict[str, Any]) -> BytesIO: + transform = str(profile["transform"]) + transform_kwargs = dict(profile.get("transform_kwargs", {})) + width = int(transform_kwargs.pop("width")) + if transform == "thumbnail": + image = cast( + Any, + pyvips.Image.thumbnail(str(source_path), width, **transform_kwargs), + ) + elif transform == "thumbnail_buffer": + image = cast( + Any, + pyvips.Image.thumbnail_buffer( + Path(source_path).read_bytes(), + width, + **transform_kwargs, + ), + ) + else: + raise ImageException(f"Unsupported image transform: {transform}") + + image = image.colourspace("srgb") + if image.hasalpha() and ( + profile["mimetype"] == "image/jpeg" + or "background" in profile.get("save_kwargs", {}) + ): + image = image.flatten( + background=profile.get("save_kwargs", {}).get("background", [255, 255, 255]) + ) + + save_name = str(profile["save"]) + try: + image_bytes = getattr(image, save_name)(**dict(profile.get("save_kwargs", {}))) + except pyvips.Error as exc: + raise ImageException(str(exc)) from exc + return BytesIO(cast(bytes, image_bytes)) + + +def image_buffer_meta( + body: bytes, + *, + fallback_mimetype: str | None = None, +) -> tuple[int, int, int, str | None]: + image = load_image_from_buffer(body) + mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype) + return image.width, image.height, len(body), mimetype + + +def image_variant_meta( + file_path: str | Path, + *, + fallback_mimetype: str | None = None, +) -> tuple[int, int, int, str | None]: + image = load_image_from_file(file_path) + mimetype = image_loader_mimetype(image_loader_name(image), fallback_mimetype) + return image.width, image.height, Path(file_path).stat().st_size, mimetype + + +class ImageNormalizePipeline(BaseFilesPipeline): + MEDIA_NAME = "image" + EXPIRES = 90 + MIN_WIDTH = 0 + MIN_HEIGHT = 0 + DEFAULT_FILES_URLS_FIELD = "image_urls" + DEFAULT_FILES_RESULT_FIELD = "images" + + @classmethod + def from_crawler(cls, crawler: Crawler): + cls._update_stores(crawler.settings) + return cls(crawler.settings["IMAGES_STORE"], crawler=crawler) + + def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler): + self.settings = crawler.settings + super().__init__(store_uri, crawler=crawler) + resolve = functools.partial( + self._key_for_pipe, + base_class_name="ImagesPipeline", + settings=self.settings, + ) + self.expires = self.settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES) + self.files_urls_field = self.settings.get( + resolve("IMAGES_URLS_FIELD"), + self.DEFAULT_FILES_URLS_FIELD, + ) + self.files_result_field = self.settings.get( + resolve("IMAGES_RESULT_FIELD"), + self.DEFAULT_FILES_RESULT_FIELD, + ) + self.min_width = self.settings.getint( + resolve("IMAGES_MIN_WIDTH"), + self.MIN_WIDTH, + ) + self.min_height = self.settings.getint( + resolve("IMAGES_MIN_HEIGHT"), + self.MIN_HEIGHT, + ) + + def get_image_settings(self) -> list[dict[str, Any]]: + return list(self.settings["REPUBLISHER_IMAGE"]) + + def file_path(self, request, response=None, info=None, *, item=None): + return repub.utils.canonical_published_image_path( + request.url, + self.get_image_settings(), + ) + + def source_path(self, request, response=None) -> str: + return repub.utils.source_image_path( + request.url, + image_mimetype(response, url=request.url), + ) + + def resolve_source_path(self, request, response=None) -> str: + source_path = self.source_path(request, response) + if response is not None: + return source_path + source_file = self.local_store_path(source_path) + if source_file.exists(): + return source_path + source_dir = self.local_store_path( + str(self.settings.get("REPUBLISHER_IMAGE_SOURCE_SUBDIR", "source")) + ) + guid = repub.utils.image_guid(request.url) + matches = sorted(source_dir.glob(f"{guid}.*")) + if matches: + return f"{source_dir.name}/{matches[0].name}" + return source_path + + def variant_paths(self, source_url: str) -> list[tuple[bool, dict[str, Any], str]]: + return [ + ( + index == 0, + setting, + repub.utils.published_image_path(source_url, setting), + ) + for index, setting in enumerate(self.get_image_settings()) + ] + + def published_url(self, path: str, item=None) -> str: + relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}" + feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/") + if feed_url == "" or item is None: + return relative_path + return f"{feed_url}/feeds/{item.feed_name}/{relative_path}" + + def local_store_path(self, path: str) -> Path: + return Path(cast(Any, self.store).basedir) / path + + def image_variant( + self, + *, + path: str, + mimetype: str, + width: int, + height: int, + file_size: int, + is_default: bool, + item=None, + ) -> MediaVariant: + variant: MediaVariant = { + "url": self.published_url(path, item), + "path": path, + "type": mimetype, + "medium": repub.utils.FileType.IMAGE.value, + "isDefault": "true" if is_default else "false", + "fileSize": file_size, + "width": width, + "height": height, + } + return variant + + def load_variants_from_disk(self, request, *, item=None) -> list[MediaVariant]: + variants: list[MediaVariant] = [] + for is_default, setting, path in self.variant_paths(request.url): + file_path = self.local_store_path(path) + if not file_path.exists(): + continue + width, height, file_size, mimetype = image_variant_meta( + file_path, + fallback_mimetype=setting["mimetype"], + ) + variants.append( + self.image_variant( + path=path, + mimetype=mimetype or setting["mimetype"], + width=width, + height=height, + file_size=file_size, + is_default=is_default, + item=item, + ) + ) + return variants + + def make_file_result( + self, + request, + *, + checksum: str | None, + status: str, + response=None, + item=None, + ) -> TranscodedImageFile: + path = self.file_path(request, item=item) + return { + "url": request.url, + "path": path, + "published_url": self.published_url(path, item), + "checksum": checksum, + "status": status, + "source_path": self.resolve_source_path(request, response), + "variants": self.load_variants_from_disk(request, item=item), + "thumbnails": [], + } + + def media_to_download(self, request, info, *, item=None): + canonical_path = self.file_path(request, info=info, item=item) + canonical_stat = cast( + dict[str, Any] | None, + self.store.stat_file(canonical_path, info), + ) + if not canonical_stat: + return None + last_modified = canonical_stat.get("last_modified") + if not last_modified: + return None + age_days = (time.time() - last_modified) / 60 / 60 / 24 + if age_days > self.expires: + return None + if not cast( + dict[str, Any] | None, + self.store.stat_file(self.resolve_source_path(request), info), + ): + return None + for _, _, path in self.variant_paths(request.url): + if not cast(dict[str, Any] | None, self.store.stat_file(path, info)): + return None + self.inc_stats("uptodate") + return self.make_file_result( + request, + checksum=canonical_stat.get("checksum"), + status="uptodate", + item=item, + ) + + def persist_variants(self, response, request, info, *, item=None) -> str | None: + source_file_path = self.local_store_path(self.source_path(request, response)) + source_buf = BytesIO(response.body) + source_image = load_image_from_buffer(response.body).autorot() + if source_image.width < self.min_width or source_image.height < self.min_height: + raise ImageException( + "Image too small " + f"({source_image.width}x{source_image.height} < " + f"{self.min_width}x{self.min_height})" + ) + if not cast( + dict[str, Any] | None, + self.store.stat_file(self.source_path(request, response), info), + ): + self.store.persist_file( + self.source_path(request, response), + source_buf, + info, + meta={"width": source_image.width, "height": source_image.height}, + headers={ + "Content-Type": image_loader_mimetype( + image_loader_name(source_image), + image_mimetype(response, url=request.url), + ) + or "application/octet-stream" + }, + ) + canonical_path = self.file_path( + request, response=response, info=info, item=item + ) + canonical_checksum = None + for _, setting, final_path in self.variant_paths(request.url): + stat = cast(dict[str, Any] | None, self.store.stat_file(final_path, info)) + if stat: + if final_path == canonical_path: + canonical_checksum = stat.get("checksum") + continue + out_buf = render_image_profile(source_file_path, setting) + width, height, file_size, _ = image_buffer_meta( + out_buf.getvalue(), + fallback_mimetype=setting["mimetype"], + ) + checksum = buffer_checksum(out_buf) + self.store.persist_file( + final_path, + out_buf, + info, + meta={"width": width, "height": height, "fileSize": file_size}, + headers={"Content-Type": setting["mimetype"]}, + ) + if final_path == canonical_path: + canonical_checksum = checksum + return canonical_checksum + + def media_downloaded(self, response, request, info, *, item=None): + if response.status != 200: + raise FileException("download-error") + if not response.body: + raise FileException("empty-content") + status = "cached" if "cached" in response.flags else "downloaded" + self.inc_stats(status) + checksum = self.persist_variants(response, request, info, item=item) + return self.make_file_result( + request, + checksum=checksum, + status=status, + response=response, + item=item, + ) + + +class ImageThumbnailPipeline: + @classmethod + def from_crawler(cls, crawler: Crawler): + return cls(crawler.settings["IMAGES_STORE"], crawler=crawler) + + def __init__(self, store_uri: Union[str, PathLike], *, crawler: Crawler): + self.settings = crawler.settings + self.store_dir = Path(store_uri) + + def get_thumbnail_settings(self) -> list[dict[str, Any]]: + return list(self.settings["REPUBLISHER_IMAGE_THUMBNAILS"]) + + def local_store_path(self, path: str) -> Path: + return self.store_dir / path + + def published_url(self, path: str, item=None) -> str: + relative_path = f"{self.settings['REPUBLISHER_IMAGE_DIR']}/{path}" + feed_url = str(self.settings.get("REPUBLISHER_FEED_URL", "")).rstrip("/") + if feed_url == "" or item is None: + return relative_path + return f"{feed_url}/feeds/{item.feed_name}/{relative_path}" + + def persist_thumbnail( + self, source_file: Path, final_path: str, profile: dict[str, Any] + ): + out_buf = render_image_profile(source_file, profile) + target = self.local_store_path(final_path) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(out_buf.getvalue()) + + def load_thumbnail( + self, + *, + source_url: str, + profile: dict[str, Any], + item=None, + ) -> ThumbnailVariant | None: + final_path = repub.utils.thumbnail_image_path(source_url, profile) + file_path = self.local_store_path(final_path) + if not file_path.exists(): + return None + width, height, _, mimetype = image_variant_meta( + file_path, + fallback_mimetype=profile["mimetype"], + ) + return { + "url": self.published_url(final_path, item), + "path": final_path, + "slot": str(profile["name"]), + "type": mimetype or profile["mimetype"], + "width": width, + "height": height, + } + + def process_item(self, item, spider): + del spider + if not getattr(item, "images", None): + return item + for image in item.images: + source_path = image.get("source_path") + if not source_path: + image["thumbnails"] = [] + continue + source_file = self.local_store_path(source_path) + thumbnails: list[ThumbnailVariant] = [] + for profile in self.get_thumbnail_settings(): + final_path = repub.utils.thumbnail_image_path(image["url"], profile) + if not self.local_store_path(final_path).exists(): + try: + self.persist_thumbnail(source_file, final_path, profile) + except ImageException as exc: + logger.warning( + "Failed to generate thumbnail for %s: %s", image["url"], exc + ) + continue + thumbnail = self.load_thumbnail( + source_url=image["url"], + profile=profile, + item=item, + ) + if thumbnail is not None: + thumbnails.append(thumbnail) + image["thumbnails"] = thumbnails + return item + + +ImagePipeline = ImageNormalizePipeline class FilePipeline(BaseFilesPipeline): diff --git a/repub/rss.py b/repub/rss.py index b2274c0..4b0ba84 100644 --- a/repub/rss.py +++ b/repub/rss.py @@ -46,6 +46,7 @@ nsmap = { "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd", "dc": "http://purl.org/dc/elements/1.1/", "atom": "http://www.w3.org/2005/Atom", + "anynews": "https://guardianproject.info/rss/anynews/1.0", } CONTENT = SafeElementMaker(nsmap={None: nsmap["content"]}, namespace=nsmap["content"]) diff --git a/repub/settings.py b/repub/settings.py index 252c974..5b0cfcb 100644 --- a/repub/settings.py +++ b/repub/settings.py @@ -100,6 +100,116 @@ LOG_LEVEL = "INFO" MEDIA_ALLOW_REDIRECTS = True +REPUBLISHER_IMAGE_NORMALIZE_ENABLED = True +REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = True + +REPUBLISHER_IMAGE_DIR = "images" +REPUBLISHER_IMAGE_FULL_SUBDIR = "full" +REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source" +REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs" + +REPUBLISHER_IMAGE = [ + { + "name": "main_webp", + "mimetype": "image/webp", + "extension": "webp", + "transform": "thumbnail", + "transform_kwargs": { + "width": 1600, + "height": 1600, + "size": "down", + "no_rotate": False, + "linear": False, + "fail_on": "warning", + }, + "save": "webpsave_buffer", + "save_kwargs": { + "Q": 82, + "preset": "photo", + "smart_subsample": True, + "effort": 4, + "alpha_q": 90, + "keep": "none", + }, + }, + { + "name": "fallback_jpeg", + "mimetype": "image/jpeg", + "extension": "jpg", + "transform": "thumbnail", + "transform_kwargs": { + "width": 1600, + "height": 1600, + "size": "down", + "no_rotate": False, + "linear": False, + "fail_on": "warning", + }, + "save": "jpegsave_buffer", + "save_kwargs": { + "Q": 85, + "interlace": True, + "optimize_coding": True, + "trellis_quant": True, + "optimize_scans": True, + "subsample_mode": "auto", + "keep": "none", + "background": [255, 255, 255], + }, + }, +] + +REPUBLISHER_IMAGE_THUMBNAILS = [ + { + "name": "card_hero", + "mimetype": "image/jpeg", + "extension": "jpg", + "transform": "thumbnail", + "transform_kwargs": { + "width": 640, + "height": 360, + "size": "down", + "crop": "attention", + "no_rotate": False, + "linear": False, + "fail_on": "warning", + }, + "save": "jpegsave_buffer", + "save_kwargs": { + "Q": 82, + "interlace": True, + "optimize_coding": True, + "subsample_mode": "auto", + "keep": "none", + "background": [255, 255, 255], + }, + }, + { + "name": "list_square", + "mimetype": "image/jpeg", + "extension": "jpg", + "transform": "thumbnail", + "transform_kwargs": { + "width": 160, + "height": 160, + "size": "down", + "crop": "centre", + "no_rotate": False, + "linear": False, + "fail_on": "warning", + }, + "save": "jpegsave_buffer", + "save_kwargs": { + "Q": 78, + "interlace": True, + "optimize_coding": True, + "subsample_mode": "auto", + "keep": "none", + "background": [255, 255, 255], + }, + }, +] + REPUBLISHER_AUDIO = [ { "name": "mp3_vbr7_voice", diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index fa27317..5b11129 100644 --- a/repub/spiders/rss_spider.py +++ b/repub/spiders/rss_spider.py @@ -21,6 +21,7 @@ from repub.rss import ( ) from repub.utils import ( FileType, + canonical_published_image_path, canonical_published_media_path, determine_file_type, local_file_path, @@ -54,7 +55,16 @@ class BaseRssFeedSpider(Spider): local_path = local_file_path(url) if file_type == FileType.IMAGE: file_dir = self.settings["REPUBLISHER_IMAGE_DIR"] - local_path = local_image_path(url) + image_profiles = ( + self.settings.get("REPUBLISHER_IMAGE") or [] + if self.settings.getbool("REPUBLISHER_IMAGE_NORMALIZE_ENABLED", True) + else [] + ) + local_path = ( + canonical_published_image_path(url, image_profiles) + if image_profiles + else local_image_path(url) + ) elif file_type == FileType.VIDEO: file_dir = self.settings["REPUBLISHER_VIDEO_DIR"] local_path = canonical_published_media_path( @@ -278,6 +288,7 @@ class RssFeedSpider(BaseRssFeedSpider): def parse_entry(self, response, feed, entry): image_urls = [] + media_image_urls = [] file_urls = [] audio_urls = [] video_urls = [] @@ -323,6 +334,7 @@ class RssFeedSpider(BaseRssFeedSpider): ) if entry.get("image"): image_urls.append(entry.get("image").href) + media_image_urls.append(entry.get("image").href) for enc in entry.enclosures: url = enc.get("href") file_type = determine_file_type(url=url, mimetype=enc.get("type")) @@ -381,6 +393,8 @@ class RssFeedSpider(BaseRssFeedSpider): ) ) add_url(file_type, media.get("url")) + if file_type == FileType.IMAGE: + media_image_urls.append(media.get("url")) return ElementItem( feed_name=self.feed_name, el=item, @@ -392,6 +406,7 @@ class RssFeedSpider(BaseRssFeedSpider): audios=[], video_urls=video_urls, videos=[], + media_image_urls=media_image_urls, ) WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)" diff --git a/repub/static/app.css b/repub/static/app.css index 94b02ed..11ab841 100644 --- a/repub/static/app.css +++ b/repub/static/app.css @@ -1,4 +1,4 @@ -/*! tailwindcss v4.2.1 | MIT License | https://tailwindcss.com */ +/*! tailwindcss v4.3.0 | MIT License | https://tailwindcss.com */ @layer properties; @layer theme, base, components, utilities; @layer theme { @@ -245,9 +245,6 @@ .inset-x-0 { inset-inline: calc(var(--spacing) * 0); } - .start { - inset-inline-start: var(--spacing); - } .top-0 { top: calc(var(--spacing) * 0); } @@ -419,6 +416,9 @@ .rotate-180 { rotate: 180deg; } + .transform { + transform: var(--tw-rotate-x,) var(--tw-rotate-y,) var(--tw-rotate-z,) var(--tw-skew-x,) var(--tw-skew-y,); + } .animate-pulse { animation: var(--animate-pulse); } @@ -1221,6 +1221,26 @@ inherits: false; initial-value: 0; } +@property --tw-rotate-x { + syntax: "*"; + inherits: false; +} +@property --tw-rotate-y { + syntax: "*"; + inherits: false; +} +@property --tw-rotate-z { + syntax: "*"; + inherits: false; +} +@property --tw-skew-x { + syntax: "*"; + inherits: false; +} +@property --tw-skew-y { + syntax: "*"; + inherits: false; +} @property --tw-space-y-reverse { syntax: "*"; inherits: false; @@ -1460,6 +1480,11 @@ --tw-translate-x: 0; --tw-translate-y: 0; --tw-translate-z: 0; + --tw-rotate-x: initial; + --tw-rotate-y: initial; + --tw-rotate-z: initial; + --tw-skew-x: initial; + --tw-skew-y: initial; --tw-space-y-reverse: 0; --tw-space-x-reverse: 0; --tw-divide-y-reverse: 0; diff --git a/repub/utils.py b/repub/utils.py index b8379a1..b443053 100644 --- a/repub/utils.py +++ b/repub/utils.py @@ -43,6 +43,50 @@ def local_audio_path(s: str) -> str: return local_file_path(s) +def image_guid(source_url: str) -> str: + return hashlib.sha1(to_bytes(source_url)).hexdigest() # nosec + + +def image_extension(mimetype_or_extension: str | None, source_url: str = "") -> str: + if mimetype_or_extension: + if mimetype_or_extension.startswith("."): + extension = mimetype_or_extension + elif "/" in mimetype_or_extension: + extension = mimetypes.guess_extension(mimetype_or_extension) or "" + else: + extension = f".{mimetype_or_extension.lstrip('.')}" + if extension == ".jpe": + return ".jpg" + return extension + guessed = Path(source_url).suffix + if guessed == ".jpe": + return ".jpg" + if guessed: + return guessed + return ".img" + + +def source_image_path(source_url: str, mimetype_or_extension: str | None = None) -> str: + extension = image_extension(mimetype_or_extension, source_url) + return f"source/{image_guid(source_url)}{extension}" + + +def published_image_path(source_url: str, profile: Mapping[str, Any]) -> str: + return variant_media_path(f"full/{image_guid(source_url)}", profile, hashed=True) + + +def canonical_published_image_path( + source_url: str, profiles: Sequence[Mapping[str, Any]] +) -> str: + if not profiles: + raise ValueError("Missing image normalization profiles") + return published_image_path(source_url, profiles[0]) + + +def thumbnail_image_path(source_url: str, profile: Mapping[str, Any]) -> str: + return variant_media_path(f"thumbs/{image_guid(source_url)}", profile, hashed=True) + + def profile_settings_hash(profile: Mapping[str, Any]) -> str: settings = { key: value @@ -65,6 +109,8 @@ def variant_media_path( def published_media_path( file_type: FileType, source_url: str, profile: Mapping[str, Any] ) -> str: + if file_type == FileType.IMAGE: + return published_image_path(source_url, profile) if file_type == FileType.AUDIO: return variant_media_path(local_audio_path(source_url), profile, hashed=True) if file_type == FileType.VIDEO: @@ -79,6 +125,8 @@ def canonical_published_media_path( raise ValueError(f"Missing transcode profiles for {file_type.value}") # The first configured profile is the public URL contract. Reordering profiles # changes published URLs for already-mirrored media. + if file_type == FileType.IMAGE: + return canonical_published_image_path(source_url, profiles) return published_media_path(file_type, source_url, profiles[0]) diff --git a/tests/test_config.py b/tests/test_config.py index cc59799..1d5816b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -224,7 +224,46 @@ def test_build_feed_settings_can_disable_image_and_video_conversion( convert_video=False, ) - assert "repub.pipelines.ImagePipeline" not in feed_settings["ITEM_PIPELINES"] + assert ( + "repub.pipelines.ImageNormalizePipeline" not in feed_settings["ITEM_PIPELINES"] + ) + assert ( + "repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"] + ) assert "repub.pipelines.VideoPipeline" not in feed_settings["ITEM_PIPELINES"] - assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 2 - assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 4 + assert feed_settings["REPUBLISHER_IMAGE_NORMALIZE_ENABLED"] is False + assert feed_settings["REPUBLISHER_IMAGE_THUMBNAILS_ENABLED"] is False + assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.AudioPipeline"] == 3 + assert feed_settings["ITEM_PIPELINES"]["repub.pipelines.FilePipeline"] == 5 + + +def test_build_feed_settings_respects_image_pipeline_feature_flags( + tmp_path: Path, +) -> None: + out_dir = (tmp_path / "mirror").resolve() + config = RepublisherConfig( + config_path=tmp_path / "repub.toml", + out_dir=out_dir, + feeds=( + FeedConfig( + name="Guardian Project Podcast", + slug="gp-pod", + url="https://guardianproject.info/podcast/podcast.xml", + ), + ), + scrapy_settings={"REPUBLISHER_IMAGE_THUMBNAILS_ENABLED": False}, + ) + + base_settings = build_base_settings(config) + feed_settings = build_feed_settings( + base_settings, + out_dir=out_dir, + feed_slug="gp-pod", + ) + + assert ( + feed_settings["ITEM_PIPELINES"]["repub.pipelines.ImageNormalizePipeline"] == 1 + ) + assert ( + "repub.pipelines.ImageThumbnailPipeline" not in feed_settings["ITEM_PIPELINES"] + ) diff --git a/tests/test_feed_validation.py b/tests/test_feed_validation.py index 9e1f80b..f395770 100644 --- a/tests/test_feed_validation.py +++ b/tests/test_feed_validation.py @@ -16,10 +16,12 @@ from repub.rss import nsmap from repub.spiders.rss_spider import RssFeedSpider from repub.utils import ( FileType, + canonical_published_image_path, local_audio_path, - local_image_path, local_video_path, + published_image_path, published_media_path, + thumbnail_image_path, ) RSS_DATE_PATTERN = re.compile( @@ -44,6 +46,7 @@ def _serialize_feed( "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE, "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, "REPUBLISHER_FEED_URL": feed_url, @@ -75,6 +78,18 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: source_video = "https://source.example/media/video.mp4" channel_image = "https://source.example/media/channel.png" item_image = "https://source.example/media/cover.jpg" + image_main_path = published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ) + image_fallback_path = published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[1], + ) + image_thumbnail_path = thumbnail_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0], + ) audio_base_path = local_audio_path(source_audio) audio_default_path = published_media_path( FileType.AUDIO, source_audio, repub_settings.REPUBLISHER_AUDIO[0] @@ -94,6 +109,60 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: ) def prepare_item(item: ElementItem) -> None: + item.images = [ + { + "url": source_image, + "path": image_main_path, + "published_url": _published_url( + "https://mirror.example", + f"images/{image_main_path}", + ), + "checksum": "image-default", + "status": "downloaded", + "source_path": "source/ignored.png", + "variants": [ + { + "url": _published_url( + "https://mirror.example", + f"images/{image_main_path}", + ), + "path": image_main_path, + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "fileSize": "2345", + "width": "1200", + "height": "675", + }, + { + "url": _published_url( + "https://mirror.example", + f"images/{image_fallback_path}", + ), + "path": image_fallback_path, + "type": "image/jpeg", + "medium": "image", + "isDefault": "false", + "fileSize": "3456", + "width": "1200", + "height": "675", + }, + ], + "thumbnails": [ + { + "url": _published_url( + "https://mirror.example", + f"images/{image_thumbnail_path}", + ), + "path": image_thumbnail_path, + "slot": "card_hero", + "type": "image/jpeg", + "width": "640", + "height": "360", + } + ], + } + ] item.audios = [ { "url": source_audio, @@ -261,6 +330,7 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: Tue, 31 Mar 2026 10:31:50 +0000 ]]> + @@ -288,7 +358,11 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: assert last_build_date == item_pub_date assert channel.findtext("itunes:explicit", namespaces=nsmap) == "false" assert channel.findtext("./image/url") == ( - f"https://mirror.example/feeds/demo/images/{local_image_path(channel_image)}" + "https://mirror.example/feeds/demo/images/" + + canonical_published_image_path( + channel_image, + repub_settings.REPUBLISHER_IMAGE, + ) ) atom_self = channel.find("atom:link", namespaces=nsmap) @@ -318,9 +392,63 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: assert root.find("./channel/item/media:content", namespaces=nsmap) is None media_groups = root.findall("./channel/item/media:group", namespaces=nsmap) - assert len(media_groups) == 2 + assert len(media_groups) == 3 + + image_group = next( + group + for group in media_groups + if group.find("media:thumbnail", namespaces=nsmap) is not None + ) + audio_group = next( + group + for group in media_groups + if group.findall("media:content", namespaces=nsmap) + and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "audio" + ) + video_group = next( + group + for group in media_groups + if group.findall("media:content", namespaces=nsmap) + and group.findall("media:content", namespaces=nsmap)[0].get("medium") == "video" + ) + + image_variants = image_group.findall("media:content", namespaces=nsmap) + assert [variant.attrib for variant in image_variants] == [ + { + "url": (f"https://mirror.example/feeds/demo/images/" f"{image_main_path}"), + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "expression": "full", + "lang": "en", + "height": "675", + "width": "1200", + "fileSize": "2345", + }, + { + "url": ( + f"https://mirror.example/feeds/demo/images/" f"{image_fallback_path}" + ), + "type": "image/jpeg", + "medium": "image", + "isDefault": "false", + "expression": "full", + "lang": "en", + "height": "675", + "width": "1200", + "fileSize": "3456", + }, + ] + thumbnails = image_group.findall("media:thumbnail", namespaces=nsmap) + assert len(thumbnails) == 1 + assert thumbnails[0].attrib == { + "url": (f"https://mirror.example/feeds/demo/images/" f"{image_thumbnail_path}"), + "width": "640", + "height": "360", + f"{{{nsmap['anynews']}}}slot": "card_hero", + f"{{{nsmap['anynews']}}}type": "image/jpeg", + } - audio_group, video_group = media_groups audio_variants = audio_group.findall("media:content", namespaces=nsmap) assert [variant.attrib for variant in audio_variants] == [ { @@ -428,7 +556,13 @@ def test_feed_generation_normalizes_dates_urls_and_xml_shapes() -> None: itunes_image = root.find("./channel/item/itunes:image", namespaces=nsmap) assert itunes_image is not None assert itunes_image.attrib == { - "href": f"https://mirror.example/feeds/demo/images/{local_image_path(item_image)}" + "href": ( + "https://mirror.example/feeds/demo/images/" + + canonical_published_image_path( + item_image, + repub_settings.REPUBLISHER_IMAGE, + ) + ) } itunes_summary = root.findtext("./channel/item/itunes:summary", namespaces=nsmap) @@ -494,3 +628,165 @@ def test_item_body_uses_description_only_when_content_is_also_present() -> None: assert both_present.findtext("content:encoded", namespaces=nsmap) == ( "
Full body
" ) + + +def test_exporter_does_not_emit_media_rss_for_inline_only_images() -> None: + source_image = "https://source.example/media/inline.jpg" + + def prepare_item(item: ElementItem) -> None: + item.images = [ + { + "url": source_image, + "path": published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ), + "published_url": _published_url( + "https://mirror.example", + "images/" + + published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ), + ), + "checksum": "inline-image", + "status": "downloaded", + "source_path": "source/inline.jpg", + "variants": [ + { + "url": _published_url( + "https://mirror.example", + "images/" + + published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ), + ), + "path": published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ), + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "width": "1200", + "height": "675", + "fileSize": "2345", + } + ], + "thumbnails": [], + } + ] + + _, root = _serialize_feed( + feed_url="https://mirror.example", + prepare_item=prepare_item, + feed_text=f""" + + + Demo Feed + https://source.example/feed + Demo description + + Inline Image Only + https://source.example/inline + inline-only + Tue, 31 Mar 2026 10:31:50 +0000 + ]]> + + + +""", + ) + + assert root.findall("./channel/item/media:group", namespaces=nsmap) == [] + + +def test_exporter_replaces_standalone_source_media_thumbnails() -> None: + source_image = "https://source.example/media/photo.jpg" + image_main_path = published_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE[0], + ) + image_thumbnail_path = thumbnail_image_path( + source_image, + repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0], + ) + + def prepare_item(item: ElementItem) -> None: + item.images = [ + { + "url": source_image, + "path": image_main_path, + "published_url": _published_url( + "https://mirror.example", + f"images/{image_main_path}", + ), + "checksum": "image-default", + "status": "downloaded", + "source_path": "source/ignored.png", + "variants": [ + { + "url": _published_url( + "https://mirror.example", + f"images/{image_main_path}", + ), + "path": image_main_path, + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "fileSize": "2345", + "width": "1200", + "height": "675", + } + ], + "thumbnails": [ + { + "url": _published_url( + "https://mirror.example", + f"images/{image_thumbnail_path}", + ), + "path": image_thumbnail_path, + "slot": "card_hero", + "type": "image/jpeg", + "width": "640", + "height": "360", + } + ], + } + ] + + _, root = _serialize_feed( + feed_url="https://mirror.example", + prepare_item=prepare_item, + feed_text=f""" + + + Demo Feed + https://source.example/feed + Demo description + + Entry One + https://source.example/entry-1 + entry-1 + Tue, 31 Mar 2026 10:31:50 +0000 + + + + + +""", + ) + + thumbnails = root.findall("./channel/item/media:thumbnail", namespaces=nsmap) + assert thumbnails == [] + group_thumbnails = root.findall( + "./channel/item/media:group/media:thumbnail", + namespaces=nsmap, + ) + assert len(group_thumbnails) == 1 + assert group_thumbnails[0].get("url") == ( + f"https://mirror.example/feeds/demo/images/{image_thumbnail_path}" + ) diff --git a/tests/test_file_feeds.py b/tests/test_file_feeds.py index ff43b6a..27d198e 100644 --- a/tests/test_file_feeds.py +++ b/tests/test_file_feeds.py @@ -8,10 +8,13 @@ from repub import settings as repub_settings from repub.spiders.rss_spider import RssFeedSpider from repub.utils import ( FileType, + canonical_published_image_path, local_audio_path, local_image_path, local_video_path, + published_image_path, published_media_path, + thumbnail_image_path, ) @@ -57,14 +60,17 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE, "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, } ) - assert ( - spider.rewrite_image_url("https://example.com/media/photo.jpg") - == f"images/{local_image_path('https://example.com/media/photo.jpg')}" + assert spider.rewrite_image_url( + "https://example.com/media/photo.jpg" + ) == "images/" + canonical_published_image_path( + "https://example.com/media/photo.jpg", + repub_settings.REPUBLISHER_IMAGE, ) assert spider.rewrite_file_url( FileType.AUDIO, @@ -90,6 +96,28 @@ def test_rss_spider_rewrites_public_asset_urls_as_relative_paths() -> None: ) +def test_rss_spider_keeps_legacy_image_paths_when_image_normalization_disabled() -> ( + None +): + spider = RssFeedSpider(feed_name="demo", url="https://example.com/feed.rss") + spider.settings = Settings( + values={ + "REPUBLISHER_IMAGE_DIR": "images", + "REPUBLISHER_FILE_DIR": "files", + "REPUBLISHER_AUDIO_DIR": "audio", + "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_IMAGE_NORMALIZE_ENABLED": False, + "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE, + "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, + "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, + } + ) + + assert spider.rewrite_image_url("https://example.com/media/photo.jpg") == ( + f"images/{local_image_path('https://example.com/media/photo.jpg')}" + ) + + def test_published_media_path_changes_when_profile_args_change() -> None: source_url = "https://example.com/media/clip.mp4" audio_profile = repub_settings.REPUBLISHER_AUDIO[0] @@ -113,6 +141,41 @@ def test_published_media_path_changes_when_profile_args_change() -> None: ) != published_media_path(FileType.VIDEO, source_url, base_profile) +def test_published_image_and_thumbnail_paths_change_when_profile_args_change() -> None: + source_url = "https://example.com/media/photo.png" + base_image_profile = repub_settings.REPUBLISHER_IMAGE[0] + base_thumbnail_profile = repub_settings.REPUBLISHER_IMAGE_THUMBNAILS[0] + + assert canonical_published_image_path( + source_url, + repub_settings.REPUBLISHER_IMAGE, + ) == published_image_path(source_url, base_image_profile) + + changed_image_profile = { + **base_image_profile, + "transform_kwargs": { + **base_image_profile["transform_kwargs"], + "width": 2048, + }, + } + assert published_image_path( + source_url, + changed_image_profile, + ) != published_image_path(source_url, base_image_profile) + + changed_thumbnail_profile = { + **base_thumbnail_profile, + "save_kwargs": { + **base_thumbnail_profile["save_kwargs"], + "Q": 60, + }, + } + assert thumbnail_image_path( + source_url, + changed_thumbnail_profile, + ) != thumbnail_image_path(source_url, base_thumbnail_profile) + + def test_rss_spider_keeps_items_with_empty_content_encoded() -> None: feed_text = """ @@ -138,6 +201,7 @@ def test_rss_spider_keeps_items_with_empty_content_encoded() -> None: "REPUBLISHER_FILE_DIR": "files", "REPUBLISHER_AUDIO_DIR": "audio", "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_IMAGE": repub_settings.REPUBLISHER_IMAGE, "REPUBLISHER_AUDIO": repub_settings.REPUBLISHER_AUDIO, "REPUBLISHER_VIDEO": repub_settings.REPUBLISHER_VIDEO, } diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 523f9bd..821a8c6 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -4,6 +4,7 @@ from types import SimpleNamespace from typing import Any, cast import pytest +import pyvips from scrapy.crawler import Crawler from scrapy.http import Request, Response @@ -16,12 +17,23 @@ from repub.config import ( build_feed_settings, ) from repub.items import ElementItem -from repub.pipelines import AudioPipeline, FilePipeline, VideoPipeline +from repub.pipelines import ( + AudioPipeline, + FilePipeline, + ImageNormalizePipeline, + ImageThumbnailPipeline, + VideoPipeline, + image_mimetype, +) from repub.utils import ( FileType, + canonical_published_image_path, local_audio_path, local_video_path, + published_image_path, published_media_path, + source_image_path, + thumbnail_image_path, ) @@ -45,17 +57,33 @@ def build_test_crawler(tmp_path: Path) -> SimpleNamespace: return SimpleNamespace(settings=settings, request_fingerprinter=object()) +class HashableSpiderInfo: + __hash__ = object.__hash__ + + def __init__(self) -> None: + self.spider = SimpleNamespace() + + def spider_info() -> Any: - return SimpleNamespace(spider=SimpleNamespace()) + return HashableSpiderInfo() def store_dir(pipeline: Any) -> Path: return Path(cast(Any, pipeline.store).basedir) +def transparent_png_bytes() -> bytes: + return cast(Any, pyvips.Image.black(2, 3, bands=4)).pngsave_buffer() + + +def png_bytes(width: int, height: int, *, bands: int = 4) -> bytes: + return cast(Any, pyvips.Image.black(width, height, bands=bands)).pngsave_buffer() + + @pytest.mark.parametrize( ("pipeline_cls", "store_setting"), [ + (ImageNormalizePipeline, "IMAGES_STORE"), (AudioPipeline, "AUDIO_STORE"), (VideoPipeline, "VIDEO_STORE"), (FilePipeline, "FILES_STORE"), @@ -630,6 +658,220 @@ def test_audio_pipeline_media_downloaded_returns_canonical_file_info_and_variant assert completed_item.audios == [result] +def test_image_mimetype_does_not_guess_from_url_extension() -> None: + assert image_mimetype(url="https://example.com/photo.jpg") is None + + +def test_image_normalize_pipeline_media_downloaded_persists_source_and_variants( + monkeypatch, tmp_path: Path +) -> None: + crawler = build_test_crawler(tmp_path) + pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler)) + monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) + source_url = "https://example.com/photo.png" + item = ElementItem( + feed_name="nasa", + el=None, + image_urls=[source_url], + images=[], + file_urls=[], + files=[], + audio_urls=[], + audios=[], + video_urls=[], + videos=[], + ) + canonical_path = canonical_published_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE"], + ) + source_path = source_image_path(source_url, "image/png") + webp_path = published_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE"][0], + ) + jpeg_path = published_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE"][1], + ) + source_body = transparent_png_bytes() + + result = pipeline.media_downloaded( + Response( + url=source_url, + body=source_body, + status=200, + headers={"Content-Type": "image/png"}, + ), + Request(source_url), + spider_info(), + item=item, + ) + webp_file_size = result["variants"][0].get("fileSize") + jpeg_file_size = result["variants"][1].get("fileSize") + + assert result == { + "url": source_url, + "path": canonical_path, + "published_url": f"https://mirror.example/feeds/nasa/images/{canonical_path}", + "checksum": result["checksum"], + "status": "downloaded", + "source_path": source_path, + "variants": [ + { + "url": f"https://mirror.example/feeds/nasa/images/{webp_path}", + "path": webp_path, + "type": "image/webp", + "medium": "image", + "isDefault": "true", + "fileSize": webp_file_size, + "width": 2, + "height": 3, + }, + { + "url": f"https://mirror.example/feeds/nasa/images/{jpeg_path}", + "path": jpeg_path, + "type": "image/jpeg", + "medium": "image", + "isDefault": "false", + "fileSize": jpeg_file_size, + "width": 2, + "height": 3, + }, + ], + "thumbnails": [], + } + assert isinstance(result["checksum"], str) + assert isinstance(webp_file_size, int) + assert isinstance(jpeg_file_size, int) + assert (store_dir(pipeline) / source_path).read_bytes() == source_body + webp_image = cast( + Any, + pyvips.Image.new_from_file(str(store_dir(pipeline) / webp_path)), + ) + jpeg_image = cast( + Any, + pyvips.Image.new_from_file(str(store_dir(pipeline) / jpeg_path)), + ) + assert (webp_image.width, webp_image.height) == (2, 3) + assert (jpeg_image.width, jpeg_image.height) == (2, 3) + assert jpeg_image.bands == 3 + + completed_item = pipeline.item_completed([(True, result)], item, spider_info()) + assert completed_item.images == [result] + + +def test_image_thumbnail_pipeline_generates_named_thumbnails_from_source_image( + monkeypatch, tmp_path: Path +) -> None: + crawler = build_test_crawler(tmp_path) + normalize_pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler)) + thumbnail_pipeline = ImageThumbnailPipeline.from_crawler(cast(Crawler, crawler)) + monkeypatch.setattr(normalize_pipeline, "inc_stats", lambda status: None) + source_url = "https://example.com/photo.png" + source_body = png_bytes(1200, 900) + item = ElementItem( + feed_name="nasa", + el=None, + image_urls=[source_url], + images=[], + file_urls=[], + files=[], + audio_urls=[], + audios=[], + video_urls=[], + videos=[], + ) + + normalized = normalize_pipeline.media_downloaded( + Response( + url=source_url, + body=source_body, + status=200, + headers={"Content-Type": "image/png"}, + ), + Request(source_url), + spider_info(), + item=item, + ) + item.images = [normalized] + + processed = thumbnail_pipeline.process_item(item, spider_info().spider) + thumbnails = processed.images[0]["thumbnails"] + thumb_slots = [thumb.get("slot") for thumb in thumbnails] + first_thumb = thumbnails[0] + second_thumb = thumbnails[1] + + assert processed.images[0]["path"] == canonical_published_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE"], + ) + assert thumb_slots == ["card_hero", "list_square"] + assert first_thumb.get("path") == thumbnail_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][0], + ) + assert first_thumb.get("type") == "image/jpeg" + assert first_thumb.get("width") == 640 + assert first_thumb.get("height") == 360 + assert second_thumb.get("path") == thumbnail_image_path( + source_url, + crawler.settings["REPUBLISHER_IMAGE_THUMBNAILS"][1], + ) + assert second_thumb.get("width") == 160 + assert second_thumb.get("height") == 160 + for thumb in thumbnails: + thumb_path = thumb.get("path") + thumb_width = thumb.get("width") + thumb_height = thumb.get("height") + thumb_image = cast( + Any, + pyvips.Image.new_from_file( + str(store_dir(normalize_pipeline) / str(thumb_path)) + ), + ) + assert (thumb_image.width, thumb_image.height) == (thumb_width, thumb_height) + + +def test_image_normalize_pipeline_cache_hit_keeps_persisted_source_path_for_extensionless_urls( + monkeypatch, tmp_path: Path +) -> None: + crawler = build_test_crawler(tmp_path) + pipeline = ImageNormalizePipeline.from_crawler(cast(Crawler, crawler)) + monkeypatch.setattr(pipeline, "inc_stats", lambda status: None) + source_url = "https://example.com/photo" + item = ElementItem( + feed_name="nasa", + el=None, + image_urls=[source_url], + images=[], + file_urls=[], + files=[], + audio_urls=[], + audios=[], + video_urls=[], + videos=[], + ) + + downloaded = pipeline.media_downloaded( + Response( + url=source_url, + body=transparent_png_bytes(), + status=200, + headers={"Content-Type": "image/png"}, + ), + Request(source_url), + spider_info(), + item=item, + ) + + uptodate = pipeline.media_to_download(Request(source_url), spider_info(), item=item) + + assert downloaded["source_path"].endswith(".png") + assert uptodate is not None + assert uptodate["source_path"] == downloaded["source_path"] + + def test_video_pipeline_media_downloaded_returns_canonical_file_info_and_variants( monkeypatch, tmp_path: Path ) -> None: diff --git a/uv.lock b/uv.lock index 857e52d..3a73346 100644 --- a/uv.lock +++ b/uv.lock @@ -812,25 +812,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/41/19c65578ef9a54b3083253c68a607f099642747168fe00f3a2bceb7c3a34/peewee-3.19.0-py3-none-any.whl", hash = "sha256:de220b94766e6008c466e00ce4ba5299b9a832117d9eb36d45d0062f3cfd7417", size = 411885, upload-time = "2026-01-07T17:24:58.33Z" }, ] -[[package]] -name = "pillow" -version = "10.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cd/74/ad3d526f3bf7b6d3f408b73fde271ec69dfac8b81341a318ce825f2b3812/pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06", size = 46555059, upload-time = "2024-07-01T09:48:43.583Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c3/00/706cebe7c2c12a6318aabe5d354836f54adff7156fd9e1bd6c89f4ba0e98/pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3", size = 3525685, upload-time = "2024-07-01T09:46:45.194Z" }, - { url = "https://files.pythonhosted.org/packages/cf/76/f658cbfa49405e5ecbfb9ba42d07074ad9792031267e782d409fd8fe7c69/pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb", size = 3374883, upload-time = "2024-07-01T09:46:47.331Z" }, - { url = "https://files.pythonhosted.org/packages/46/2b/99c28c4379a85e65378211971c0b430d9c7234b1ec4d59b2668f6299e011/pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70", size = 4339837, upload-time = "2024-07-01T09:46:49.647Z" }, - { url = "https://files.pythonhosted.org/packages/f1/74/b1ec314f624c0c43711fdf0d8076f82d9d802afd58f1d62c2a86878e8615/pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be", size = 4455562, upload-time = "2024-07-01T09:46:51.811Z" }, - { url = "https://files.pythonhosted.org/packages/4a/2a/4b04157cb7b9c74372fa867096a1607e6fedad93a44deeff553ccd307868/pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0", size = 4366761, upload-time = "2024-07-01T09:46:53.961Z" }, - { url = "https://files.pythonhosted.org/packages/ac/7b/8f1d815c1a6a268fe90481232c98dd0e5fa8c75e341a75f060037bd5ceae/pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc", size = 4536767, upload-time = "2024-07-01T09:46:56.664Z" }, - { url = "https://files.pythonhosted.org/packages/e5/77/05fa64d1f45d12c22c314e7b97398ffb28ef2813a485465017b7978b3ce7/pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a", size = 4477989, upload-time = "2024-07-01T09:46:58.977Z" }, - { url = "https://files.pythonhosted.org/packages/12/63/b0397cfc2caae05c3fb2f4ed1b4fc4fc878f0243510a7a6034ca59726494/pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309", size = 4610255, upload-time = "2024-07-01T09:47:01.189Z" }, - { url = "https://files.pythonhosted.org/packages/7b/f9/cfaa5082ca9bc4a6de66ffe1c12c2d90bf09c309a5f52b27759a596900e7/pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060", size = 2235603, upload-time = "2024-07-01T09:47:03.918Z" }, - { url = "https://files.pythonhosted.org/packages/01/6a/30ff0eef6e0c0e71e55ded56a38d4859bf9d3634a94a88743897b5f96936/pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea", size = 2554972, upload-time = "2024-07-01T09:47:06.152Z" }, - { url = "https://files.pythonhosted.org/packages/48/2c/2e0a52890f269435eee38b21c8218e102c621fe8d8df8b9dd06fabf879ba/pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d", size = 2243375, upload-time = "2024-07-01T09:47:09.065Z" }, -] - [[package]] name = "platformdirs" version = "4.9.4" @@ -1012,6 +993,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] +[[package]] +name = "pyvips" +version = "3.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2d/6a/282936de9faac6addf6bc8792c18e006489d0023ffd8856b8643f54d0558/pyvips-3.1.1.tar.gz", hash = "sha256:84fe744d023b1084ac2516bb17064cacd41c7f8aabf8e524dd383534941b9301", size = 56951, upload-time = "2025-12-09T18:38:06.355Z" } + [[package]] name = "pyyaml" version = "6.0.3" @@ -1093,10 +1083,10 @@ dependencies = [ { name = "hypercorn" }, { name = "lxml" }, { name = "peewee" }, - { name = "pillow" }, { name = "prometheus-client" }, { name = "pygea" }, { name = "python-dateutil" }, + { name = "pyvips" }, { name = "quart" }, { name = "scrapy" }, ] @@ -1126,10 +1116,10 @@ requires-dist = [ { name = "hypercorn", specifier = ">=0.18.0,<0.19.0" }, { name = "lxml", specifier = ">=5.2.1,<6.0.0" }, { name = "peewee", specifier = ">=3.19.0,<4.0.0" }, - { name = "pillow", specifier = ">=10.3.0,<11.0.0" }, { name = "prometheus-client", specifier = ">=0.20.0,<0.21.0" }, { name = "pygea", git = "https://guardianproject.dev/anynews/pygea.git" }, { name = "python-dateutil", specifier = ">=2.9.0.post0,<3.0.0" }, + { name = "pyvips", specifier = ">=3.0.0,<4.0.0" }, { name = "quart", specifier = ">=0.20.0,<0.21.0" }, { name = "scrapy", specifier = ">=2.11.1,<3.0.0" }, ]