style: apply formatter

fix: publish feeds atomically
docs: document image pipeline profiles
2026-05-27 10:58:07 +02:00 · 2026-05-27 10:57:21 +02:00 · 2026-05-27 10:13:06 +02:00
14 changed files with 298 additions and 10 deletions
--- a/README.md
+++ b/README.md
@ -59,6 +59,17 @@ Operational notes:
 - Mirrored feeds are written under `out/feeds/<slug>/`.
  In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
 - `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
+- Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size
+  variants; the first profile determines the canonical image URL used when
+  feed image URLs are rewritten.
+- Default image profiles keep source bytes under `images/source/`, write
+  full-size variants under `images/full/`, and write thumbnail profiles from
+  `REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`.
+- Explicit item image media is exported as Media RSS image groups with named
+  thumbnails. Inline HTML images are mirrored and rewritten in content, but are
+  not promoted to item-level Media RSS.
+- Image profile names and transform settings are part of generated filenames.
+  Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
 - Job logs and stats artifacts are written under `out/logs/`.

 The legacy one-shot config-driven crawler is still available:
@ -79,10 +90,9 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
 - [x] Offlines RSS feed xml
 - [x] Downloads media and enclosures
 - [x] Rewrites media urls
- [x] Image normalization (JPG, RGB)
+- [x] Profile-driven image normalization, compression, and thumbnails
 - [x] Audio transcoding
 - [x] Video transcoding
- [ ] Image compression - Do we want this? -> DEFERED for now
 - [x] Download and rewrite media embedded in content/CDATA fields
 - [x] Config file to drive the program
 - [x] Add sqlite database and simple admin UI to replace config
--- a/demo/README.md
+++ b/demo/README.md
@ -17,6 +17,19 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/
 - `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides
 - `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing

+## Image Profiles
+
+The demo config uses the default image profiles from `repub/settings.py`.
+`REPUBLISHER_IMAGE` controls full-size image variants; the first profile sets
+the canonical image URL written into feeds. `REPUBLISHER_IMAGE_THUMBNAILS`
+controls named thumbnail variants for explicit item image media.
+
+By default, mirrored image source bytes are kept under `images/source/`, full
+profile variants are written under `images/full/`, and thumbnail profile
+variants are written under `images/thumbs/` inside each feed output directory.
+Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml)
+when a demo run needs to disable thumbnails or test a different profile set.
+
 ## Local File Feed

 `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:
--- a/demo/repub.toml
+++ b/demo/repub.toml
@ -14,3 +14,11 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
 LOG_LEVEL = "INFO"
 DOWNLOAD_TIMEOUT = 30
 REPUBLISHER_FEED_URL = "https://mirror.example"
+
+# Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size
+# variants, and its first profile is the canonical image URL written into feeds.
+# REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item
+# image media. Defaults live in repub/settings.py and generate WebP + JPEG full
+# images plus JPEG thumbnails.
+# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
+# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true
--- a/repub/config.py
+++ b/repub/config.py
@ -38,6 +38,10 @@ def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
    return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss"


+def staged_feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
+    """Return the hidden ``.feed.rss.next`` staging path for a feed.
+
+    The staged file lives in the same per-feed directory as ``feed.rss``.
+    """
+    feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
+    return feed_dir.joinpath(".feed.rss.next")
+
+
 def _resolve_path(base_path: Path, value: str) -> Path:
    path = Path(value).expanduser()
    if not path.is_absolute():
@ -218,7 +222,7 @@ def build_feed_settings(
        {
            "REPUBLISHER_OUT_DIR": str(out_dir),
            "FEEDS": {
-                str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
+                str(staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
                    "format": "rss",
                    "postprocessing": [],
                    "feed_name": feed_slug,
--- a/repub/crawl.py
+++ b/repub/crawl.py
@ -15,6 +15,7 @@ from repub.config import (
    load_config,
 )
 from repub.media import check_runtime
+from repub.postprocessing import publish_staged_feed
 from repub.spiders.rss_spider import RssFeedSpider

 logger = logging.getLogger(__name__)
@ -81,6 +82,14 @@ def run_feeds(
        deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)

        def handle_success(_: object) -> None:
+            try:
+                publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+            except Exception:
+                failure = Failure()
+                logger.error("Feed %s (%s) failed to publish", feed.name, feed.slug)
+                logger.critical("%s", failure.getTraceback())
+                results.append((feed.slug, failure))
+                return None
            logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
            results.append((feed.slug, None))
            return None
--- a/repub/job_runner.py
+++ b/repub/job_runner.py
@ -31,6 +31,7 @@ from repub.model import (
    initialize_database,
    load_feed_url,
 )
+from repub.postprocessing import publish_staged_feed
 from repub.spiders.rss_spider import RssFeedSpider


@ -299,6 +300,14 @@ def main(argv: list[str] | None = None) -> int:
        return 130

    if exit_code == 0:
+        try:
+            publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+        except Exception as error:
+            print(
+                f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
+                flush=True,
+            )
+            return 1
        print(
            f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
            flush=True,
--- a/repub/pages/sources.py
+++ b/repub/pages/sources.py
@ -381,7 +381,7 @@ def source_form(
                            ),
                            toggle_field(
                                label="Convert images",
-                                description="Normalize mirrored images through the image conversion pipeline for this source.",
+                                description="Run mirrored images through configured image profiles and thumbnail profiles for this source.",
                                signal_name="convertImages",
                                checked=_checked(source, "convert_images", True),
                            ),
--- a/repub/postprocessing.py
+++ b/repub/postprocessing.py
@ -0,0 +1,47 @@
+from __future__ import annotations
+
+import os
+from contextlib import suppress
+from pathlib import Path
+from xml.etree import ElementTree
+
+from repub.config import feed_output_path, staged_feed_output_path
+
+
+def publish_staged_feed(*, out_dir: Path, feed_slug: str) -> Path:
+    """Validate the staged feed and swap it in as the public feed.
+
+    The staged file is checked first, so a missing or unusable crawl output
+    leaves both the existing public feed and the output tree untouched. It is
+    then flushed to disk, renamed over the public path with ``os.replace``,
+    and the containing directory is synced (best effort) so the rename itself
+    reaches disk.
+
+    Returns:
+        The public feed path that now holds the staged content.
+
+    Raises:
+        FileNotFoundError: If no staged feed exists for ``feed_slug``.
+        ValueError: If the staged feed is not well-formed XML or is not an
+            RSS document with a channel.
+    """
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)
+    public_path = feed_output_path(out_dir=out_dir, feed_slug=feed_slug)
+
+    # Validate before touching the filesystem: creating the feed directory
+    # first would leave an empty public directory behind for a failed publish.
+    _validate_staged_feed(staged_path)
+    public_path.parent.mkdir(parents=True, exist_ok=True)
+    _fsync_file(staged_path)
+    os.replace(staged_path, public_path)
+    _fsync_directory(public_path.parent)
+    return public_path
+
+
+def _fsync_file(path: Path) -> None:
+    """Ask the OS to flush the file at *path* to disk via ``os.fsync``."""
+    with open(path, "rb") as stream:
+        descriptor = stream.fileno()
+        os.fsync(descriptor)
+
+
+def _validate_staged_feed(path: Path) -> None:
+    """Reject a staged feed that is not a usable RSS document.
+
+    Raises:
+        ValueError: If the file is not well-formed XML, its root element is
+            not ``rss``, or the root has no direct ``channel`` child.
+    """
+    try:
+        document = ElementTree.parse(path)
+    except ElementTree.ParseError as parse_error:
+        raise ValueError(
+            f"Staged feed is not well-formed XML: {path}"
+        ) from parse_error
+
+    rss = document.getroot()
+    if rss.tag != "rss":
+        raise ValueError(f"Staged feed is not an RSS document: {path}")
+
+    channel = rss.find("channel")
+    if channel is None:
+        raise ValueError(f"Staged feed is missing an RSS channel: {path}")
+
+
+def _fsync_directory(path: Path) -> None:
+    """Best-effort ``os.fsync`` of the directory at *path*.
+
+    Any ``OSError`` raised while opening, syncing, or closing the directory
+    is ignored, so this never fails the caller.
+    """
+    # Use O_DIRECTORY when the platform defines it; otherwise plain read-only.
+    open_flags = os.O_RDONLY | getattr(os, "O_DIRECTORY", 0)
+    try:
+        directory_fd = os.open(path, open_flags)
+        try:
+            os.fsync(directory_fd)
+        finally:
+            os.close(directory_fd)
+    except OSError:
+        return
--- a/repub/settings.py
+++ b/repub/settings.py
@ -108,6 +108,8 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
 REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
 REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"

+# Full-size image profiles. The first profile determines the canonical public
+# image URL used when feed image URLs are rewritten.
 REPUBLISHER_IMAGE = [
    {
        "name": "main_webp",
@ -159,6 +161,8 @@ REPUBLISHER_IMAGE = [
    },
 ]

+# Named thumbnail profiles emitted as Media RSS thumbnails for explicit item
+# image media.
 REPUBLISHER_IMAGE_THUMBNAILS = [
    {
        "name": "card_hero",
--- a/repub/utils.py
+++ b/repub/utils.py
@ -79,7 +79,7 @@ def canonical_published_image_path(
    source_url: str, profiles: Sequence[Mapping[str, Any]]
 ) -> str:
    if not profiles:
-        raise ValueError("Missing image normalization profiles")
+        raise ValueError("Missing image profiles")
    return published_image_path(source_url, profiles[0])


@ -122,7 +122,7 @@ def canonical_published_media_path(
    file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
 ) -> str:
    if not profiles:
-        raise ValueError(f"Missing transcode profiles for {file_type.value}")
+        raise ValueError(f"Missing media profiles for {file_type.value}")
    # The first configured profile is the public URL contract. Reordering profiles
    # changes published URLs for already-mirrored media.
    if file_type == FileType.IMAGE:
--- a/tests/test_config.py
+++ b/tests/test_config.py
@ -154,7 +154,7 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug(
        out_dir / "feeds" / "info-marti" / "files"
    )
    assert feed_settings["FEEDS"] == {
-        str(out_dir / "feeds" / "info-marti" / "feed.rss"): {
+        str(out_dir / "feeds" / "info-marti" / ".feed.rss.next"): {
            "format": "rss",
            "postprocessing": [],
            "feed_name": "info-marti",
--- a/tests/test_job_runner.py
+++ b/tests/test_job_runner.py
@ -2,8 +2,11 @@ from pathlib import Path

 import pytest

-from repub.config import FeedConfig
-from repub.job_runner import _build_crawl_settings
+from repub import job_runner as job_runner_module
+from repub.config import FeedConfig, feed_output_path, staged_feed_output_path
+from repub.job_runner import JobSourceConfig, _build_crawl_settings
+
+VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'


 def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
@ -35,3 +38,128 @@ def test_build_crawl_settings_requires_non_empty_feed_url(
            stats_path=tmp_path / "stats.jsonl",
            feed_url="",
        )
+
+
+def test_main_publishes_staged_feed_after_successful_crawl(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    """A crawl exiting 0 promotes the staged feed over the old public feed."""
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text(VALID_FEED, encoding="utf-8")
+
+    _patch_worker_dependencies(monkeypatch, exit_code=0)
+
+    exit_code = job_runner_module.main(
+        [
+            "--job-id",
+            "1",
+            "--execution-id",
+            "2",
+            "--db-path",
+            str(tmp_path / "republisher.db"),
+            "--out-dir",
+            str(out_dir),
+            "--stats-path",
+            str(tmp_path / "stats.jsonl"),
+        ]
+    )
+
+    assert exit_code == 0
+    assert public_path.read_text(encoding="utf-8") == VALID_FEED
+    # The staged file is renamed into place, so it must no longer exist.
+    assert not staged_path.exists()
+
+
+def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    """A channel-less staged feed makes the worker exit 1 and changes no file."""
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text('<rss version="2.0"/>\n', encoding="utf-8")
+
+    _patch_worker_dependencies(monkeypatch, exit_code=0)
+
+    exit_code = job_runner_module.main(
+        [
+            "--job-id",
+            "1",
+            "--execution-id",
+            "2",
+            "--db-path",
+            str(tmp_path / "republisher.db"),
+            "--out-dir",
+            str(out_dir),
+            "--stats-path",
+            str(tmp_path / "stats.jsonl"),
+        ]
+    )
+
+    assert exit_code == 1
+    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
+    assert staged_path.read_text(encoding="utf-8") == '<rss version="2.0"/>\n'
+
+
+def test_main_does_not_publish_staged_feed_after_failed_crawl(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    """A crawl exiting non-zero leaves the public and staged feeds untouched."""
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text(VALID_FEED, encoding="utf-8")
+
+    _patch_worker_dependencies(monkeypatch, exit_code=1)
+
+    exit_code = job_runner_module.main(
+        [
+            "--job-id",
+            "1",
+            "--execution-id",
+            "2",
+            "--db-path",
+            str(tmp_path / "republisher.db"),
+            "--out-dir",
+            str(out_dir),
+            "--stats-path",
+            str(tmp_path / "stats.jsonl"),
+        ]
+    )
+
+    assert exit_code == 1
+    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
+    # Even a valid staged feed must stay staged when the crawl itself failed.
+    assert staged_path.read_text(encoding="utf-8") == VALID_FEED
+
+
+def _patch_worker_dependencies(
+    monkeypatch: pytest.MonkeyPatch, *, exit_code: int
+) -> None:
+    """Stub the job runner's collaborators so ``main`` runs without a crawl.
+
+    Replaces the source-config lookup, the feed URL loader, ``CrawlerProcess``,
+    and ``_run_crawl`` on the job runner module; the stubbed crawl returns
+    ``exit_code``.
+    """
+    monkeypatch.setattr(
+        job_runner_module,
+        "_load_job_source_config",
+        lambda *, db_path, job_id: JobSourceConfig(
+            source_name="Demo",
+            source_slug="demo",
+            source_type="feed",
+            spider_arguments={},
+            feed_url="https://source.example/feed.rss",
+        ),
+    )
+    monkeypatch.setattr(
+        job_runner_module, "load_feed_url", lambda: "https://mirror.example"
+    )
+    monkeypatch.setattr(
+        job_runner_module,
+        "CrawlerProcess",
+        lambda settings: object(),
+    )
+    monkeypatch.setattr(
+        job_runner_module,
+        "_run_crawl",
+        lambda *, process, feed, spider_arguments: exit_code,
+    )
--- a/tests/test_postprocessing.py
+++ b/tests/test_postprocessing.py
@ -0,0 +1,52 @@
+from pathlib import Path
+
+import pytest
+
+from repub.config import feed_output_path, staged_feed_output_path
+from repub.postprocessing import publish_staged_feed
+
+VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
+
+
+def test_publish_staged_feed_replaces_public_feed(tmp_path: Path) -> None:
+    """Publishing moves a valid staged feed over the existing public feed."""
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text(VALID_FEED, encoding="utf-8")
+
+    published_path = publish_staged_feed(out_dir=out_dir, feed_slug="demo")
+
+    assert published_path == public_path
+    assert public_path.read_text(encoding="utf-8") == VALID_FEED
+    # The staged file is renamed, not copied.
+    assert not staged_path.exists()
+
+
+def test_publish_staged_feed_requires_staged_file(tmp_path: Path) -> None:
+    """Publishing a feed with no staged file raises ``FileNotFoundError``."""
+    with pytest.raises(FileNotFoundError):
+        publish_staged_feed(out_dir=tmp_path / "out", feed_slug="missing")
+
+
+@pytest.mark.parametrize(
+    "staged_feed",
+    [
+        # Well-formed XML, but no <channel> element.
+        '<rss version="2.0"/>\n',
+        # Not well-formed: <channel> is never closed.
+        '<rss version="2.0"><channel></rss>\n',
+    ],
+)
+def test_publish_staged_feed_rejects_unusable_feed(
+    tmp_path: Path, staged_feed: str
+) -> None:
+    """An unusable staged feed raises ``ValueError`` and changes neither file."""
+    out_dir = tmp_path / "out"
+    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+    public_path.parent.mkdir(parents=True)
+    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
+    staged_path.write_text(staged_feed, encoding="utf-8")
+
+    with pytest.raises(ValueError):
+        publish_staged_feed(out_dir=out_dir, feed_slug="demo")
+
+    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
+    assert staged_path.read_text(encoding="utf-8") == staged_feed
--- a/tests/test_scheduler_runtime.py
+++ b/tests/test_scheduler_runtime.py
@ -1088,7 +1088,11 @@ def test_render_execution_logs_handles_missing_execution_and_missing_log_file(
            await render_execution_logs(app, job_id=job.id, execution_id=9999)
        )
        missing_log = str(
-            await render_execution_logs(app, job_id=job.id, execution_id=execution.id)
+            await render_execution_logs(
+                app,
+                job_id=job.id,
+                execution_id=int(execution.get_id()),
+            )
        )

        assert "Execution log unavailable" in missing_execution
Author	SHA1	Message	Date
Abel Luck	3b6503a6ed	style: apply formatter All checks were successful buildbot/nix-eval Build done. Details buildbot/nix-build Build done. Details buildbot/nix-effects Build done. Details	2026-05-27 10:58:07 +02:00
Abel Luck	e64a32d76b	fix: publish feeds atomically	2026-05-27 10:57:21 +02:00
Abel Luck	cbb427b89d	docs: document image pipeline profiles	2026-05-27 10:13:06 +02:00