docs: document image pipeline profiles
This commit is contained in:
parent
18a7f652d4
commit
cbb427b89d
6 changed files with 40 additions and 5 deletions
14
README.md
14
README.md
|
|
@ -59,6 +59,17 @@ Operational notes:
|
|||
- Mirrored feeds are written under `out/feeds/<slug>/`.
|
||||
In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
|
||||
- `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
|
||||
- Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size
|
||||
variants; the first profile determines the canonical image URL used when feed image
|
||||
URLs are rewritten.
|
||||
- Default image profiles keep source bytes under `images/source/`, write
|
||||
full-size variants under `images/full/`, and write thumbnail profiles from
|
||||
`REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`.
|
||||
- Explicit item image media is exported as Media RSS image groups with named
|
||||
thumbnails. Inline HTML images are mirrored and rewritten in content, but are
|
||||
not promoted to item-level Media RSS.
|
||||
- Image profile names and transform settings are part of generated filenames.
|
||||
Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
|
||||
- Job logs and stats artifacts are written under `out/logs/`.
|
||||
|
||||
The legacy one-shot config-driven crawler is still available:
|
||||
|
|
@ -79,10 +90,9 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
|
|||
- [x] Offlines RSS feed XML
|
||||
- [x] Downloads media and enclosures
|
||||
- [x] Rewrites media URLs
|
||||
- [x] Image normalization (JPG, RGB)
|
||||
- [x] Profile-driven image normalization, compression, and thumbnails
|
||||
- [x] Audio transcoding
|
||||
- [x] Video transcoding
|
||||
- [ ] Image compression - Do we want this? -> DEFERRED for now
|
||||
- [x] Download and rewrite media embedded in content/CDATA fields
|
||||
- [x] Config file to drive the program
|
||||
- [x] Add sqlite database and simple admin UI to replace config
|
||||
|
|
|
|||
|
|
@ -17,6 +17,19 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/
|
|||
- `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides
|
||||
- `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing
|
||||
|
||||
## Image Profiles
|
||||
|
||||
The demo config uses the default image profiles from `repub/settings.py`.
|
||||
`REPUBLISHER_IMAGE` controls full-size image variants; the first profile determines the
|
||||
canonical image URL written into feeds. `REPUBLISHER_IMAGE_THUMBNAILS` controls
|
||||
named thumbnail variants for explicit item image media.
|
||||
|
||||
By default, mirrored image source bytes are kept under `images/source/`, full
|
||||
profile variants are written under `images/full/`, and thumbnail profile
|
||||
variants are written under `images/thumbs/` inside each feed output directory.
|
||||
Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml)
|
||||
when a demo run needs to disable thumbnails or test a different profile set.
|
||||
|
||||
## Local File Feed
|
||||
|
||||
`repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:
|
||||
|
|
|
|||
|
|
@ -14,3 +14,11 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
|
|||
LOG_LEVEL = "INFO"
|
||||
DOWNLOAD_TIMEOUT = 30
|
||||
REPUBLISHER_FEED_URL = "https://mirror.example"
|
||||
|
||||
# Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size
|
||||
# variants, and its first profile determines the canonical image URL written into feeds.
|
||||
# REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item
|
||||
# image media. Defaults live in repub/settings.py and generate WebP + JPEG full
|
||||
# images plus JPEG thumbnails.
|
||||
# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
|
||||
# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true
|
||||
|
|
|
|||
|
|
@ -381,7 +381,7 @@ def source_form(
|
|||
),
|
||||
toggle_field(
|
||||
label="Convert images",
|
||||
description="Normalize mirrored images through the image conversion pipeline for this source.",
|
||||
description="Run mirrored images through configured image profiles and thumbnail profiles for this source.",
|
||||
signal_name="convertImages",
|
||||
checked=_checked(source, "convert_images", True),
|
||||
),
|
||||
|
|
|
|||
|
|
@ -108,6 +108,8 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
|
|||
REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
|
||||
REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
|
||||
|
||||
# Full-size image profiles. The first profile is the canonical public image
|
||||
# URL used when feed image URLs are rewritten.
|
||||
REPUBLISHER_IMAGE = [
|
||||
{
|
||||
"name": "main_webp",
|
||||
|
|
@ -159,6 +161,8 @@ REPUBLISHER_IMAGE = [
|
|||
},
|
||||
]
|
||||
|
||||
# Named thumbnail profiles emitted as Media RSS thumbnails for explicit item
|
||||
# image media.
|
||||
REPUBLISHER_IMAGE_THUMBNAILS = [
|
||||
{
|
||||
"name": "card_hero",
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ def canonical_published_image_path(
|
|||
source_url: str, profiles: Sequence[Mapping[str, Any]]
|
||||
) -> str:
|
||||
if not profiles:
|
||||
raise ValueError("Missing image normalization profiles")
|
||||
raise ValueError("Missing image profiles")
|
||||
return published_image_path(source_url, profiles[0])
|
||||
|
||||
|
||||
|
|
@ -122,7 +122,7 @@ def canonical_published_media_path(
|
|||
file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
|
||||
) -> str:
|
||||
if not profiles:
|
||||
raise ValueError(f"Missing transcode profiles for {file_type.value}")
|
||||
raise ValueError(f"Missing media profiles for {file_type.value}")
|
||||
# The first configured profile is the public URL contract. Reordering profiles
|
||||
# changes published URLs for already-mirrored media.
|
||||
if file_type == FileType.IMAGE:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue