From cbb427b89de53365f3ab8f7285ee37336b1fa884 Mon Sep 17 00:00:00 2001
From: Abel Luck <abel@guardianproject.info>
Date: Wed, 27 May 2026 10:13:06 +0200
Subject: [PATCH 1/3] docs: document image pipeline profiles

---
 README.md              | 14 ++++++++++++--
 demo/README.md         | 13 +++++++++++++
 demo/repub.toml        |  8 ++++++++
 repub/pages/sources.py |  2 +-
 repub/settings.py      |  4 ++++
 repub/utils.py         |  4 ++--
 6 files changed, 40 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 213f955..cab926d 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,17 @@ Operational notes:
 - Mirrored feeds are written under `out/feeds/<slug>/`.
   In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
 - `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
+- Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size
+  variants; the first profile is the canonical image URL used when feed image
+  URLs are rewritten.
+- Default image profiles keep source bytes under `images/source/`, write
+  full-size variants under `images/full/`, and write thumbnail profiles from
+  `REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`.
+- Explicit item image media is exported as Media RSS image groups with named
+  thumbnails. Inline HTML images are mirrored and rewritten in content, but are
+  not promoted to item-level Media RSS.
+- Image profile names and transform settings are part of generated filenames.
+  Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
 - Job logs and stats artifacts are written under `out/logs/`.
 
 The legacy one-shot config-driven crawler is still available:
@@ -79,10 +90,9 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
 - [x] Offlines RSS feed xml
 - [x] Downloads media and enclosures
 - [x] Rewrites media urls
-- [x] Image normalization (JPG, RGB)
+- [x] Profile-driven image normalization, compression, and thumbnails
 - [x] Audio transcoding
 - [x] Video transcoding
-- [ ] Image compression - Do we want this? -> DEFERED for now
 - [x] Download and rewrite media embedded in content/CDATA fields
 - [x] Config file to drive the program
 - [x] Add sqlite database and simple admin UI to replace config
diff --git a/demo/README.md b/demo/README.md
index 4cca777..af4f0b8 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -17,6 +17,19 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/
 - `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides
 - `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing
 
+## Image Profiles
+
+The demo config uses the default image profiles from `repub/settings.py`.
+`REPUBLISHER_IMAGE` controls full-size image variants; the first profile is the
+canonical image URL written into feeds. `REPUBLISHER_IMAGE_THUMBNAILS` controls
+named thumbnail variants for explicit item image media.
+
+By default, mirrored image source bytes are kept under `images/source/`, full
+profile variants are written under `images/full/`, and thumbnail profile
+variants are written under `images/thumbs/` inside each feed output directory.
+Edit the Scrapy settings in [`demo/repub.toml`](repub.toml)
+when a demo run needs to disable thumbnails or test a different profile set.
+
 ## Local File Feed
 
 `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:
diff --git a/demo/repub.toml b/demo/repub.toml
index bc4ac2b..d829325 100644
--- a/demo/repub.toml
+++ b/demo/repub.toml
@@ -14,3 +14,11 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
 LOG_LEVEL = "INFO"
 DOWNLOAD_TIMEOUT = 30
 REPUBLISHER_FEED_URL = "https://mirror.example"
+
+# Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size
+# variants, and its first profile is the canonical image URL written into feeds.
+# REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item
+# image media. Defaults live in repub/settings.py and generate WebP + JPEG full
+# images plus JPEG thumbnails.
+# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
+# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true
diff --git a/repub/pages/sources.py b/repub/pages/sources.py
index 62d1e9a..fbc5377 100644
--- a/repub/pages/sources.py
+++ b/repub/pages/sources.py
@@ -381,7 +381,7 @@ def source_form(
                             ),
                             toggle_field(
                                 label="Convert images",
-                                description="Normalize mirrored images through the image conversion pipeline for this source.",
+                                description="Run mirrored images through configured image profiles and thumbnail profiles for this source.",
                                 signal_name="convertImages",
                                 checked=_checked(source, "convert_images", True),
                             ),
diff --git a/repub/settings.py b/repub/settings.py
index 5b0cfcb..ae5c5d2 100644
--- a/repub/settings.py
+++ b/repub/settings.py
@@ -108,6 +108,8 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
 REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
 REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
 
+# Full-size image profiles. The first profile is the canonical public image
+# URL used when feed image URLs are rewritten.
 REPUBLISHER_IMAGE = [
     {
         "name": "main_webp",
@@ -159,6 +161,8 @@ REPUBLISHER_IMAGE = [
     },
 ]
 
+# Named thumbnail profiles emitted as Media RSS thumbnails for explicit item
+# image media.
 REPUBLISHER_IMAGE_THUMBNAILS = [
     {
         "name": "card_hero",
diff --git a/repub/utils.py b/repub/utils.py
index b443053..a7f2ef9 100644
--- a/repub/utils.py
+++ b/repub/utils.py
@@ -79,7 +79,7 @@ def canonical_published_image_path(
     source_url: str, profiles: Sequence[Mapping[str, Any]]
 ) -> str:
     if not profiles:
-        raise ValueError("Missing image normalization profiles")
+        raise ValueError("Missing image profiles")
     return published_image_path(source_url, profiles[0])
 
 
@@ -122,7 +122,7 @@ def canonical_published_media_path(
     file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
 ) -> str:
     if not profiles:
-        raise ValueError(f"Missing transcode profiles for {file_type.value}")
+        raise ValueError(f"Missing media profiles for {file_type.value}")
     # The first configured profile is the public URL contract. Reordering profiles
     # changes published URLs for already-mirrored media.
     if file_type == FileType.IMAGE:

From e64a32d76b6e12fa4d21db13b41315e55ad8cd43 Mon Sep 17 00:00:00 2001
From: Abel Luck <abel@guardianproject.info>
Date: Wed, 27 May 2026 10:57:21 +0200
Subject: [PATCH 2/3] fix: publish feeds atomically

---
 repub/config.py              |   6 +-
 repub/crawl.py               |   9 +++
 repub/job_runner.py          |   9 +++
 repub/postprocessing.py      |  47 +++++++++++++
 tests/test_config.py         |   2 +-
 tests/test_job_runner.py     | 132 ++++++++++++++++++++++++++++++++++-
 tests/test_postprocessing.py |  52 ++++++++++++++
 7 files changed, 253 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_postprocessing.py

diff --git a/repub/config.py b/repub/config.py
index d17c7d7..459e6b2 100644
--- a/repub/config.py
+++ b/repub/config.py
@@ -38,6 +38,10 @@ def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
     return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss"
 
 
+def staged_feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
+    return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / ".feed.rss.next"
+
+
 def _resolve_path(base_path: Path, value: str) -> Path:
     path = Path(value).expanduser()
     if not path.is_absolute():
@@ -218,7 +222,7 @@ def build_feed_settings(
         {
             "REPUBLISHER_OUT_DIR": str(out_dir),
             "FEEDS": {
-                str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
+                str(staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
                     "format": "rss",
                     "postprocessing": [],
                     "feed_name": feed_slug,
diff --git a/repub/crawl.py b/repub/crawl.py
index afa789f..6f0d9f4 100644
--- a/repub/crawl.py
+++ b/repub/crawl.py
@@ -15,6 +15,7 @@ from repub.config import (
     load_config,
 )
 from repub.media import check_runtime
+from repub.postprocessing import publish_staged_feed
 from repub.spiders.rss_spider import RssFeedSpider
 
 logger = logging.getLogger(__name__)
@@ -81,6 +82,14 @@ def run_feeds(
         deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
 
         def handle_success(_: object) -> None:
+            try:
+                publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+            except Exception:
+                failure = Failure()
+                logger.error("Feed %s (%s) failed to publish", feed.name, feed.slug)
+                logger.critical("%s", failure.getTraceback())
+                results.append((feed.slug, failure))
+                return None
             logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
             results.append((feed.slug, None))
             return None
diff --git a/repub/job_runner.py b/repub/job_runner.py
index 68b3be1..008bd15 100644
--- a/repub/job_runner.py
+++ b/repub/job_runner.py
@@ -31,6 +31,7 @@ from repub.model import (
     initialize_database,
     load_feed_url,
 )
+from repub.postprocessing import publish_staged_feed
 from repub.spiders.rss_spider import RssFeedSpider
 
 
@@ -299,6 +300,14 @@ def main(argv: list[str] | None = None) -> int:
         return 130
 
     if exit_code == 0:
+        try:
+            publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+        except Exception as error:
+            print(
+                f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
+                flush=True,
+            )
+            return 1
         print(
             f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
             flush=True,
diff --git a/repub/postprocessing.py b/repub/postprocessing.py
index e69de29..984c92a 100644
--- a/repub/postprocessing.py
+++ b/repub/postprocessing.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import os
+from contextlib import suppress
+from pathlib import Path
+from xml.etree import ElementTree
+
+from repub.config import feed_output_path, staged_feed_output_path
+
+
+def publish_staged_feed(*, out_dir: Path, feed_slug: str) -> Path:
+    """Validate the staged feed and atomically swap it into the public path.
+
+    The staged file is parsed, flushed to disk, and then renamed over the
+    public feed with ``os.replace`` so readers never see a partial feed.
+
+    Returns the public feed path. Raises ``FileNotFoundError`` when no staged
+    feed exists and ``ValueError`` when the staged feed is not a usable RSS
+    document; in both cases the public feed is left untouched.
+    """
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)
+    public_path = feed_output_path(out_dir=out_dir, feed_slug=feed_slug)
+
+    public_path.parent.mkdir(parents=True, exist_ok=True)
+    _validate_staged_feed(staged_path)
+    _fsync_file(staged_path)
+    os.replace(staged_path, public_path)
+    _fsync_directory(public_path.parent)
+    return public_path
+
+
+def _fsync_file(path: Path) -> None:
+    """Flush the file's contents to stable storage."""
+    # Open read-write without truncating: os.fsync maps to _commit() on
+    # Windows, which rejects descriptors that were opened read-only.
+    with path.open("r+b") as handle:
+        os.fsync(handle.fileno())
+
+
+def _validate_staged_feed(path: Path) -> None:
+    """Raise ``ValueError`` unless the file is well-formed RSS with a channel."""
+    try:
+        root = ElementTree.parse(path).getroot()
+    except ElementTree.ParseError as error:
+        raise ValueError(f"Staged feed is not well-formed XML: {path}") from error
+
+    if root.tag != "rss":
+        raise ValueError(f"Staged feed is not an RSS document: {path}")
+    if root.find("channel") is None:
+        raise ValueError(f"Staged feed is missing an RSS channel: {path}")
+
+
+def _fsync_directory(path: Path) -> None:
+    """Best-effort fsync of a directory so a rename inside it is durable."""
+    # O_DIRECTORY is not defined on every platform, and some platforms cannot
+    # open or fsync a directory at all, so any OSError is deliberately ignored.
+    flags = os.O_RDONLY | getattr(os, "O_DIRECTORY", 0)
+    with suppress(OSError):
+        fd = os.open(path, flags)
+        try:
+            os.fsync(fd)
+        finally:
+            os.close(fd)
diff --git a/tests/test_config.py b/tests/test_config.py
index 1d5816b..517dc91 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -154,7 +154,7 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug(
         out_dir / "feeds" / "info-marti" / "files"
     )
     assert feed_settings["FEEDS"] == {
-        str(out_dir / "feeds" / "info-marti" / "feed.rss"): {
+        str(out_dir / "feeds" / "info-marti" / ".feed.rss.next"): {
             "format": "rss",
             "postprocessing": [],
             "feed_name": "info-marti",
diff --git a/tests/test_job_runner.py b/tests/test_job_runner.py
index d7fa936..712a540 100644
--- a/tests/test_job_runner.py
+++ b/tests/test_job_runner.py
@@ -2,8 +2,11 @@ from pathlib import Path
 
 import pytest
 
-from repub.config import FeedConfig
-from repub.job_runner import _build_crawl_settings
+from repub import job_runner as job_runner_module
+from repub.config import FeedConfig, feed_output_path, staged_feed_output_path
+from repub.job_runner import JobSourceConfig, _build_crawl_settings
+
+VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
 
 
 def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
@@ -35,3 +38,128 @@ def test_build_crawl_settings_requires_non_empty_feed_url(
             stats_path=tmp_path / "stats.jsonl",
             feed_url="",
         )
+
+
+def test_main_publishes_staged_feed_after_successful_crawl(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text(VALID_FEED, encoding="utf-8")
+
+    _patch_worker_dependencies(monkeypatch, exit_code=0)
+
+    exit_code = job_runner_module.main(
+        [
+            "--job-id",
+            "1",
+            "--execution-id",
+            "2",
+            "--db-path",
+            str(tmp_path / "republisher.db"),
+            "--out-dir",
+            str(out_dir),
+            "--stats-path",
+            str(tmp_path / "stats.jsonl"),
+        ]
+    )
+
+    assert exit_code == 0
+    assert public_path.read_text(encoding="utf-8") == VALID_FEED
+    assert not staged_path.exists()
+
+
+def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text('<rss version="2.0"/>\n', encoding="utf-8")
+
+    _patch_worker_dependencies(monkeypatch, exit_code=0)
+
+    exit_code = job_runner_module.main(
+        [
+            "--job-id",
+            "1",
+            "--execution-id",
+            "2",
+            "--db-path",
+            str(tmp_path / "republisher.db"),
+            "--out-dir",
+            str(out_dir),
+            "--stats-path",
+            str(tmp_path / "stats.jsonl"),
+        ]
+    )
+
+    assert exit_code == 1
+    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
+    assert staged_path.read_text(encoding="utf-8") == '<rss version="2.0"/>\n'
+
+
+def test_main_does_not_publish_staged_feed_after_failed_crawl(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text(VALID_FEED, encoding="utf-8")
+
+    _patch_worker_dependencies(monkeypatch, exit_code=1)
+
+    exit_code = job_runner_module.main(
+        [
+            "--job-id",
+            "1",
+            "--execution-id",
+            "2",
+            "--db-path",
+            str(tmp_path / "republisher.db"),
+            "--out-dir",
+            str(out_dir),
+            "--stats-path",
+            str(tmp_path / "stats.jsonl"),
+        ]
+    )
+
+    assert exit_code == 1
+    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
+    assert staged_path.read_text(encoding="utf-8") == VALID_FEED
+
+
+def _patch_worker_dependencies(
+    monkeypatch: pytest.MonkeyPatch, *, exit_code: int
+) -> None:
+    monkeypatch.setattr(
+        job_runner_module,
+        "_load_job_source_config",
+        lambda *, db_path, job_id: JobSourceConfig(
+            source_name="Demo",
+            source_slug="demo",
+            source_type="feed",
+            spider_arguments={},
+            feed_url="https://source.example/feed.rss",
+        ),
+    )
+    monkeypatch.setattr(
+        job_runner_module, "load_feed_url", lambda: "https://mirror.example"
+    )
+    monkeypatch.setattr(
+        job_runner_module,
+        "CrawlerProcess",
+        lambda settings: object(),
+    )
+    monkeypatch.setattr(
+        job_runner_module,
+        "_run_crawl",
+        lambda *, process, feed, spider_arguments: exit_code,
+    )
diff --git a/tests/test_postprocessing.py b/tests/test_postprocessing.py
new file mode 100644
index 0000000..77a221a
--- /dev/null
+++ b/tests/test_postprocessing.py
@@ -0,0 +1,52 @@
+from pathlib import Path
+
+import pytest
+
+from repub.config import feed_output_path, staged_feed_output_path
+from repub.postprocessing import publish_staged_feed
+
+VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
+
+
+def test_publish_staged_feed_replaces_public_feed(tmp_path: Path) -> None:
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text(VALID_FEED, encoding="utf-8")
+
+    published_path = publish_staged_feed(out_dir=out_dir, feed_slug="demo")
+
+    assert published_path == public_path
+    assert public_path.read_text(encoding="utf-8") == VALID_FEED
+    assert not staged_path.exists()
+
+
+def test_publish_staged_feed_requires_staged_file(tmp_path: Path) -> None:
+    with pytest.raises(FileNotFoundError):
+        publish_staged_feed(out_dir=tmp_path / "out", feed_slug="missing")
+
+
+@pytest.mark.parametrize(
+    "staged_feed",
+    [
+        '<rss version="2.0"/>\n',
+        '<rss version="2.0"><channel></rss>\n',
+    ],
+)
+def test_publish_staged_feed_rejects_unusable_feed(
+    tmp_path: Path, staged_feed: str
+) -> None:
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text(staged_feed, encoding="utf-8")
+
+    with pytest.raises(ValueError):
+        publish_staged_feed(out_dir=out_dir, feed_slug="demo")
+
+    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
+    assert staged_path.read_text(encoding="utf-8") == staged_feed

From 3b6503a6ede21ae6596a350e3840f862841f9bd7 Mon Sep 17 00:00:00 2001
From: Abel Luck <abel@guardianproject.info>
Date: Wed, 27 May 2026 10:58:07 +0200
Subject: [PATCH 3/3] test: pass an int execution id to render_execution_logs

---
 tests/test_scheduler_runtime.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/test_scheduler_runtime.py b/tests/test_scheduler_runtime.py
index def747c..362db11 100644
--- a/tests/test_scheduler_runtime.py
+++ b/tests/test_scheduler_runtime.py
@@ -1088,7 +1088,11 @@ def test_render_execution_logs_handles_missing_execution_and_missing_log_file(
             await render_execution_logs(app, job_id=job.id, execution_id=9999)
         )
         missing_log = str(
-            await render_execution_logs(app, job_id=job.id, execution_id=execution.id)
+            await render_execution_logs(
+                app,
+                job_id=job.id,
+                execution_id=int(execution.get_id()),
+            )
         )
 
         assert "Execution log unavailable" in missing_execution