docs: document image pipeline profiles

This commit is contained in:
Abel Luck 2026-05-27 10:13:06 +02:00
parent 18a7f652d4
commit cbb427b89d
6 changed files with 40 additions and 5 deletions

View file

@ -59,6 +59,17 @@ Operational notes:
- Mirrored feeds are written under `out/feeds/<slug>/`. - Mirrored feeds are written under `out/feeds/<slug>/`.
In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`. In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
- `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds. - `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
- Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size
variants; the first profile determines the canonical image URL used when feed
image URLs are rewritten.
- Default image profiles keep source bytes under `images/source/`, write
full-size variants under `images/full/`, and write thumbnail profiles from
`REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`.
- Explicit item image media is exported as Media RSS image groups with named
thumbnails. Inline HTML images are mirrored and rewritten in content, but are
not promoted to item-level Media RSS.
- Image profile names and transform settings are part of generated filenames.
Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
- Job logs and stats artifacts are written under `out/logs/`. - Job logs and stats artifacts are written under `out/logs/`.
The legacy one-shot config-driven crawler is still available: The legacy one-shot config-driven crawler is still available:
@ -79,10 +90,9 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
- [x] Offlines RSS feed xml - [x] Offlines RSS feed xml
- [x] Downloads media and enclosures - [x] Downloads media and enclosures
- [x] Rewrites media urls - [x] Rewrites media urls
- [x] Image normalization (JPG, RGB) - [x] Profile-driven image normalization, compression, and thumbnails
- [x] Audio transcoding - [x] Audio transcoding
- [x] Video transcoding - [x] Video transcoding
- [ ] Image compression - Do we want this? -> DEFERRED for now
- [x] Download and rewrite media embedded in content/CDATA fields - [x] Download and rewrite media embedded in content/CDATA fields
- [x] Config file to drive the program - [x] Config file to drive the program
- [x] Add sqlite database and simple admin UI to replace config - [x] Add sqlite database and simple admin UI to replace config

View file

@ -17,6 +17,19 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/
- `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides - `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides
- `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing - `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing
## Image Profiles
The demo config uses the default image profiles from `repub/settings.py`.
`REPUBLISHER_IMAGE` controls full-size image variants; the first profile
determines the canonical image URL written into feeds.
`REPUBLISHER_IMAGE_THUMBNAILS` controls named thumbnail variants for explicit
item image media.
By default, mirrored image source bytes are kept under `images/source/`, full
profile variants are written under `images/full/`, and thumbnail profile
variants are written under `images/thumbs/` inside each feed output directory.
Edit the Scrapy settings in [`demo/repub.toml`](repub.toml)
when a demo run needs to disable thumbnails or test a different profile set.
## Local File Feed ## Local File Feed
`repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root: `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:

View file

@ -14,3 +14,11 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
LOG_LEVEL = "INFO" LOG_LEVEL = "INFO"
DOWNLOAD_TIMEOUT = 30 DOWNLOAD_TIMEOUT = 30
REPUBLISHER_FEED_URL = "https://mirror.example" REPUBLISHER_FEED_URL = "https://mirror.example"
# Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size
# variants, and its first profile determines the canonical image URL written
# into feeds.
# REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item
# image media. Defaults live in repub/settings.py and generate WebP + JPEG full
# images plus JPEG thumbnails.
# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true

View file

@ -381,7 +381,7 @@ def source_form(
), ),
toggle_field( toggle_field(
label="Convert images", label="Convert images",
description="Normalize mirrored images through the image conversion pipeline for this source.", description="Run mirrored images through configured image profiles and thumbnail profiles for this source.",
signal_name="convertImages", signal_name="convertImages",
checked=_checked(source, "convert_images", True), checked=_checked(source, "convert_images", True),
), ),

View file

@ -108,6 +108,8 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source" REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs" REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
# Full-size image profiles. The first profile determines the canonical public
# image URL used when feed image URLs are rewritten.
REPUBLISHER_IMAGE = [ REPUBLISHER_IMAGE = [
{ {
"name": "main_webp", "name": "main_webp",
@ -159,6 +161,8 @@ REPUBLISHER_IMAGE = [
}, },
] ]
# Named thumbnail profiles emitted as Media RSS thumbnails for explicit item
# image media.
REPUBLISHER_IMAGE_THUMBNAILS = [ REPUBLISHER_IMAGE_THUMBNAILS = [
{ {
"name": "card_hero", "name": "card_hero",

View file

@ -79,7 +79,7 @@ def canonical_published_image_path(
source_url: str, profiles: Sequence[Mapping[str, Any]] source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str: ) -> str:
if not profiles: if not profiles:
raise ValueError("Missing image normalization profiles") raise ValueError("Missing image profiles")
return published_image_path(source_url, profiles[0]) return published_image_path(source_url, profiles[0])
@ -122,7 +122,7 @@ def canonical_published_media_path(
file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]] file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
) -> str: ) -> str:
if not profiles: if not profiles:
raise ValueError(f"Missing transcode profiles for {file_type.value}") raise ValueError(f"Missing media profiles for {file_type.value}")
# The first configured profile is the public URL contract. Reordering profiles # The first configured profile is the public URL contract. Reordering profiles
# changes published URLs for already-mirrored media. # changes published URLs for already-mirrored media.
if file_type == FileType.IMAGE: if file_type == FileType.IMAGE: