From cbb427b89de53365f3ab8f7285ee37336b1fa884 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Wed, 27 May 2026 10:13:06 +0200 Subject: [PATCH] docs: document image pipeline profiles --- README.md | 14 ++++++++++++-- demo/README.md | 13 +++++++++++++ demo/repub.toml | 8 ++++++++ repub/pages/sources.py | 2 +- repub/settings.py | 4 ++++ repub/utils.py | 4 ++-- 6 files changed, 40 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 213f955..cab926d 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,17 @@ Operational notes: - Mirrored feeds are written under `out/feeds//`. In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`. - `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds. +- Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size + variants; the first profile is the canonical image URL used when feed image + URLs are rewritten. +- Default image profiles keep source bytes under `images/source/`, write + full-size variants under `images/full/`, and write thumbnail profiles from + `REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`. +- Explicit item image media is exported as Media RSS image groups with named + thumbnails. Inline HTML images are mirrored and rewritten in content, but are + not promoted to item-level Media RSS. +- Image profile names and transform settings are part of generated filenames. + Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs. - Job logs and stats artifacts are written under `out/logs/`. The legacy one-shot config-driven crawler is still available: @@ -79,10 +90,9 @@ REPUBLISHER_FEED_URL = "https://mirror.example" - [x] Offlines RSS feed xml - [x] Downloads media and enclosures - [x] Rewrites media urls -- [x] Image normalization (JPG, RGB) +- [x] Profile-driven image normalization, compression, and thumbnails - [x] Audio transcoding - [x] Video transcoding -- [ ] Image compression - Do we want this? -> DEFERED for now - [x] Download and rewrite media embedded in content/CDATA fields - [x] Config file to drive the program - [x] Add sqlite database and simple admin UI to replace config diff --git a/demo/README.md b/demo/README.md index 4cca777..af4f0b8 100644 --- a/demo/README.md +++ b/demo/README.md @@ -17,6 +17,19 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/ - `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides - `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing +## Image Profiles + +The demo config uses the default image profiles from `repub/settings.py`. +`REPUBLISHER_IMAGE` controls full-size image variants; the first profile is the +canonical image URL written into feeds. `REPUBLISHER_IMAGE_THUMBNAILS` controls +named thumbnail variants for explicit item image media. + +By default, mirrored image source bytes are kept under `images/source/`, full +profile variants are written under `images/full/`, and thumbnail profile +variants are written under `images/thumbs/` inside each feed output directory. +Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml) +when a demo run needs to disable thumbnails or test a different profile set. + ## Local File Feed `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root: diff --git a/demo/repub.toml b/demo/repub.toml index bc4ac2b..d829325 100644 --- a/demo/repub.toml +++ b/demo/repub.toml @@ -14,3 +14,11 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss" LOG_LEVEL = "INFO" DOWNLOAD_TIMEOUT = 30 REPUBLISHER_FEED_URL = "https://mirror.example" + +# Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size +# variants, and its first profile is the canonical image URL written into feeds. +# REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item +# image media. Defaults live in repub/settings.py and generate WebP + JPEG full +# images plus JPEG thumbnails. +# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true +# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true diff --git a/repub/pages/sources.py b/repub/pages/sources.py index 62d1e9a..fbc5377 100644 --- a/repub/pages/sources.py +++ b/repub/pages/sources.py @@ -381,7 +381,7 @@ def source_form( ), toggle_field( label="Convert images", - description="Normalize mirrored images through the image conversion pipeline for this source.", + description="Run mirrored images through configured image profiles and thumbnail profiles for this source.", signal_name="convertImages", checked=_checked(source, "convert_images", True), ), diff --git a/repub/settings.py b/repub/settings.py index 5b0cfcb..ae5c5d2 100644 --- a/repub/settings.py +++ b/repub/settings.py @@ -108,6 +108,8 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full" REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source" REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs" +# Full-size image profiles. The first profile is the canonical public image +# URL used when feed image URLs are rewritten. REPUBLISHER_IMAGE = [ { "name": "main_webp", @@ -159,6 +161,8 @@ REPUBLISHER_IMAGE = [ }, ] +# Named thumbnail profiles emitted as Media RSS thumbnails for explicit item +# image media. REPUBLISHER_IMAGE_THUMBNAILS = [ { "name": "card_hero", diff --git a/repub/utils.py b/repub/utils.py index b443053..a7f2ef9 100644 --- a/repub/utils.py +++ b/repub/utils.py @@ -79,7 +79,7 @@ def canonical_published_image_path( source_url: str, profiles: Sequence[Mapping[str, Any]] ) -> str: if not profiles: - raise ValueError("Missing image normalization profiles") + raise ValueError("Missing image profiles") return published_image_path(source_url, profiles[0]) @@ -122,7 +122,7 @@ def canonical_published_media_path( file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]] ) -> str: if not profiles: - raise ValueError(f"Missing transcode profiles for {file_type.value}") + raise ValueError(f"Missing media profiles for {file_type.value}") # The first configured profile is the public URL contract. Reordering profiles # changes published URLs for already-mirrored media. if file_type == FileType.IMAGE: