style: apply formatter

fix: publish feeds atomically
docs: document image pipeline profiles
2026-05-27 10:58:07 +02:00 · 2026-05-27 10:57:21 +02:00 · 2026-05-27 10:13:06 +02:00
14 changed files with 298 additions and 10 deletions
--- a/README.md
+++ b/README.md
@ -59,6 +59,17 @@ Operational notes:
 - Mirrored feeds are written under `out/feeds/<slug>/`.
  In production, expose `out/feeds/` directly from the reverse proxy at `/feeds/`.
 - `Feed URL` is used to generate absolute media URLs and `atom:link rel="self"` in exported feeds.
 - Image output is profile-driven. `REPUBLISHER_IMAGE` defines full-size
  variants; the first profile is the canonical image URL used when feed image
  URLs are rewritten.
 - Default image profiles keep source bytes under `images/source/`, write
  full-size variants under `images/full/`, and write thumbnail profiles from
  `REPUBLISHER_IMAGE_THUMBNAILS` under `images/thumbs/`.
 - Explicit item image media is exported as Media RSS image groups with named
  thumbnails. Inline HTML images are mirrored and rewritten in content, but are
  not promoted to item-level Media RSS.
 - Image profile names and transform settings are part of generated filenames.
  Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
 - Job logs and stats artifacts are written under `out/logs/`.
 The legacy one-shot config-driven crawler is still available:
@ -79,10 +90,9 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
 - [x] Offlines RSS feed xml
 - [x] Downloads media and enclosures
 - [x] Rewrites media urls
- [x] Image normalization (JPG, RGB)
+- [x] Profile-driven image normalization, compression, and thumbnails
 - [x] Audio transcoding
 - [x] Video transcoding
 - [ ] Image compression - Do we want this? -> DEFERRED for now
 - [x] Download and rewrite media embedded in content/CDATA fields
 - [x] Config file to drive the program
 - [x] Add sqlite database and simple admin UI to replace config
--- a/demo/README.md
+++ b/demo/README.md
@ -17,6 +17,19 @@ Because `out_dir` in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/
 - `repub.toml`: example runtime config with feed definitions, slugs, and Scrapy overrides
 - `fixtures/local-feed.rss`: simple local RSS fixture for `file://` feed testing
 ## Image Profiles
 The demo config uses the default image profiles from `repub/settings.py`.
 `REPUBLISHER_IMAGE` controls full-size image variants; the first profile is the
 canonical image URL written into feeds. `REPUBLISHER_IMAGE_THUMBNAILS` controls
 named thumbnail variants for explicit item image media.
 By default, mirrored image source bytes are kept under `images/source/`, full
 profile variants are written under `images/full/`, and thumbnail profile
 variants are written under `images/thumbs/` inside each feed output directory.
 Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml)
 when a demo run needs to disable thumbnails or test a different profile set.
 ## Local File Feed
 `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:
--- a/demo/repub.toml
+++ b/demo/repub.toml
@ -14,3 +14,11 @@ url = "https://www.nasa.gov/rss/dyn/breaking_news.rss"
 LOG_LEVEL = "INFO"
 DOWNLOAD_TIMEOUT = 30
 REPUBLISHER_FEED_URL = "https://mirror.example"
 # Image mirroring is profile-driven. REPUBLISHER_IMAGE controls full-size
 # variants, and its first profile is the canonical image URL written into feeds.
 # REPUBLISHER_IMAGE_THUMBNAILS controls named thumbnails for explicit item
 # image media. Defaults live in repub/settings.py and generate WebP + JPEG full
 # images plus JPEG thumbnails.
 # REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
 # REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true
--- a/repub/config.py
+++ b/repub/config.py
@ -38,6 +38,10 @@ def feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
    return feed_output_dir(out_dir=out_dir, feed_slug=feed_slug) / "feed.rss"
 def staged_feed_output_path(*, out_dir: Path, feed_slug: str) -> Path:
    """Return the staging path a feed is written to before it is published.

    The staged file is named ``.feed.rss.next`` and lives in the same feed
    output directory that ``feed_output_dir`` returns for ``feed_slug``.
    """
    feed_dir = feed_output_dir(out_dir=out_dir, feed_slug=feed_slug)
    return feed_dir / ".feed.rss.next"
 def _resolve_path(base_path: Path, value: str) -> Path:
    path = Path(value).expanduser()
    if not path.is_absolute():
@ -218,7 +222,7 @@ def build_feed_settings(
        {
            "REPUBLISHER_OUT_DIR": str(out_dir),
            "FEEDS": {
-                str(feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
+                str(staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)): {
                    "format": "rss",
                    "postprocessing": [],
                    "feed_name": feed_slug,
--- a/repub/crawl.py
+++ b/repub/crawl.py
@ -15,6 +15,7 @@ from repub.config import (
    load_config,
 )
 from repub.media import check_runtime
 from repub.postprocessing import publish_staged_feed
 from repub.spiders.rss_spider import RssFeedSpider
 logger = logging.getLogger(__name__)
@ -81,6 +82,14 @@ def run_feeds(
        deferred = process.crawl(crawler, feed_name=feed.slug, url=feed.url)
        def handle_success(_: object) -> None:
            """Publish the staged feed after a successful crawl and record the outcome.

            A publish error is logged with its traceback and recorded in
            ``results`` as a ``Failure``; otherwise the feed is recorded with
            ``None`` to mark success.
            """
            try:
                publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
            except Exception:
                # Failure() with no arguments captures the exception being handled.
                publish_failure = Failure()
                logger.error("Feed %s (%s) failed to publish", feed.name, feed.slug)
                logger.critical("%s", publish_failure.getTraceback())
                results.append((feed.slug, publish_failure))
            else:
                logger.info("Feed %s (%s) completed successfully", feed.name, feed.slug)
                results.append((feed.slug, None))
            return None
--- a/repub/job_runner.py
+++ b/repub/job_runner.py
@ -31,6 +31,7 @@ from repub.model import (
    initialize_database,
    load_feed_url,
 )
 from repub.postprocessing import publish_staged_feed
 from repub.spiders.rss_spider import RssFeedSpider
@ -299,6 +300,14 @@ def main(argv: list[str] | None = None) -> int:
        return 130
    if exit_code == 0:
        try:
            publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
        except Exception as error:
            print(
                f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
                flush=True,
            )
            return 1
        print(
            f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
            flush=True,
--- a/repub/pages/sources.py
+++ b/repub/pages/sources.py
@ -381,7 +381,7 @@ def source_form(
                            ),
                            toggle_field(
                                label="Convert images",
-                                description="Normalize mirrored images through the image conversion pipeline for this source.",
+                                description="Run mirrored images through configured image profiles and thumbnail profiles for this source.",
                                signal_name="convertImages",
                                checked=_checked(source, "convert_images", True),
                            ),
--- a/repub/postprocessing.py
+++ b/repub/postprocessing.py
@ -0,0 +1,47 @@
 from __future__ import annotations
 import os
 from contextlib import suppress
 from pathlib import Path
 from xml.etree import ElementTree
 from repub.config import feed_output_path, staged_feed_output_path
 def publish_staged_feed(*, out_dir: Path, feed_slug: str) -> Path:
    """Promote a feed's staged output file to its public location.

    The staged file is validated as an RSS document and flushed to disk
    before ``os.replace`` moves it over the public feed, so a staged feed
    that fails validation leaves the existing public file untouched. The
    containing directory is synced afterwards on a best-effort basis.

    Args:
        out_dir: Root output directory for mirrored feeds.
        feed_slug: Slug identifying the feed to publish.

    Returns:
        The path of the public feed file.

    Raises:
        ValueError: If the staged file is not a usable RSS document.
    """
    public_path = feed_output_path(out_dir=out_dir, feed_slug=feed_slug)
    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug=feed_slug)

    feed_dir = public_path.parent
    feed_dir.mkdir(parents=True, exist_ok=True)

    # Validate first: nothing below may run for an unusable staged feed.
    _validate_staged_feed(staged_path)
    _fsync_file(staged_path)

    os.replace(staged_path, public_path)
    _fsync_directory(feed_dir)
    return public_path
 def _fsync_file(path: Path) -> None:
    """Ask the OS to flush the contents of the file at ``path`` to disk."""
    with path.open("rb") as stream:
        descriptor = stream.fileno()
        os.fsync(descriptor)
 def _validate_staged_feed(path: Path) -> None:
    """Check that the staged file is an RSS document with a channel.

    Raises:
        ValueError: If the file is not well-formed XML, its root element is
            not ``rss``, or the root has no ``channel`` child.
    """
    try:
        tree = ElementTree.parse(path)
    except ElementTree.ParseError as error:
        raise ValueError(f"Staged feed is not well-formed XML: {path}") from error

    document = tree.getroot()
    if document.tag != "rss":
        raise ValueError(f"Staged feed is not an RSS document: {path}")

    channel = document.find("channel")
    if channel is None:
        raise ValueError(f"Staged feed is missing an RSS channel: {path}")
 def _fsync_directory(path: Path) -> None:
    """Best-effort fsync of the directory at ``path``.

    Any ``OSError`` raised while opening, syncing, or closing the directory
    is swallowed, so the call never fails for that reason.
    """
    open_flags = os.O_RDONLY
    # O_DIRECTORY is not defined on every platform; add it only when present.
    if hasattr(os, "O_DIRECTORY"):
        open_flags |= os.O_DIRECTORY
    try:
        descriptor = os.open(path, open_flags)
        try:
            os.fsync(descriptor)
        finally:
            os.close(descriptor)
    except OSError:
        return None
--- a/repub/settings.py
+++ b/repub/settings.py
@ -108,6 +108,8 @@ REPUBLISHER_IMAGE_FULL_SUBDIR = "full"
 REPUBLISHER_IMAGE_SOURCE_SUBDIR = "source"
 REPUBLISHER_IMAGE_THUMBNAIL_SUBDIR = "thumbs"
 # Full-size image profiles. The first profile is the canonical public image
 # URL used when feed image URLs are rewritten.
 REPUBLISHER_IMAGE = [
    {
        "name": "main_webp",
@ -159,6 +161,8 @@ REPUBLISHER_IMAGE = [
    },
 ]
 # Named thumbnail profiles emitted as Media RSS thumbnails for explicit item
 # image media.
 REPUBLISHER_IMAGE_THUMBNAILS = [
    {
        "name": "card_hero",
--- a/repub/utils.py
+++ b/repub/utils.py
@ -79,7 +79,7 @@ def canonical_published_image_path(
    source_url: str, profiles: Sequence[Mapping[str, Any]]
 ) -> str:
    if not profiles:
-        raise ValueError("Missing image normalization profiles")
+        raise ValueError("Missing image profiles")
    return published_image_path(source_url, profiles[0])
@ -122,7 +122,7 @@ def canonical_published_media_path(
    file_type: FileType, source_url: str, profiles: Sequence[Mapping[str, Any]]
 ) -> str:
    if not profiles:
-        raise ValueError(f"Missing transcode profiles for {file_type.value}")
+        raise ValueError(f"Missing media profiles for {file_type.value}")
    # The first configured profile is the public URL contract. Reordering profiles
    # changes published URLs for already-mirrored media.
    if file_type == FileType.IMAGE:
--- a/tests/test_config.py
+++ b/tests/test_config.py
@ -154,7 +154,7 @@ def test_build_feed_settings_derives_output_paths_from_feed_slug(
        out_dir / "feeds" / "info-marti" / "files"
    )
    assert feed_settings["FEEDS"] == {
-        str(out_dir / "feeds" / "info-marti" / "feed.rss"): {
+        str(out_dir / "feeds" / "info-marti" / ".feed.rss.next"): {
            "format": "rss",
            "postprocessing": [],
            "feed_name": "info-marti",
--- a/tests/test_job_runner.py
+++ b/tests/test_job_runner.py
@ -2,8 +2,11 @@ from pathlib import Path
 import pytest
-from repub.config import FeedConfig
+from repub import job_runner as job_runner_module
-from repub.job_runner import _build_crawl_settings
+from repub.config import FeedConfig, feed_output_path, staged_feed_output_path
 from repub.job_runner import JobSourceConfig, _build_crawl_settings
 VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
 def test_build_crawl_settings_passes_feed_url_to_spider(tmp_path: Path) -> None:
@ -35,3 +38,128 @@ def test_build_crawl_settings_requires_non_empty_feed_url(
            stats_path=tmp_path / "stats.jsonl",
            feed_url="",
        )
 def test_main_publishes_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
 ) -> None:
    """A crawl exit code of 0 moves the valid staged feed over the public feed."""
    out_dir = tmp_path / "out"
    live_feed = feed_output_path(out_dir=out_dir, feed_slug="demo")
    next_feed = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    live_feed.parent.mkdir(parents=True)
    live_feed.write_text("<rss>old</rss>\n", encoding="utf-8")
    next_feed.write_text(VALID_FEED, encoding="utf-8")
    _patch_worker_dependencies(monkeypatch, exit_code=0)

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    status = job_runner_module.main(argv)

    assert status == 0
    assert live_feed.read_text(encoding="utf-8") == VALID_FEED
    assert not next_feed.exists()
 def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
 ) -> None:
    """A staged feed without a channel fails the job and leaves both files as-is."""
    channelless_feed = '<rss version="2.0"/>\n'
    out_dir = tmp_path / "out"
    live_feed = feed_output_path(out_dir=out_dir, feed_slug="demo")
    next_feed = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    live_feed.parent.mkdir(parents=True)
    live_feed.write_text("<rss>old</rss>\n", encoding="utf-8")
    next_feed.write_text(channelless_feed, encoding="utf-8")
    _patch_worker_dependencies(monkeypatch, exit_code=0)

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    status = job_runner_module.main(argv)

    assert status == 1
    assert live_feed.read_text(encoding="utf-8") == "<rss>old</rss>\n"
    assert next_feed.read_text(encoding="utf-8") == channelless_feed
 def test_main_does_not_publish_staged_feed_after_failed_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
 ) -> None:
    """A non-zero crawl exit code leaves the public and staged feeds untouched."""
    out_dir = tmp_path / "out"
    live_feed = feed_output_path(out_dir=out_dir, feed_slug="demo")
    next_feed = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    live_feed.parent.mkdir(parents=True)
    live_feed.write_text("<rss>old</rss>\n", encoding="utf-8")
    next_feed.write_text(VALID_FEED, encoding="utf-8")
    _patch_worker_dependencies(monkeypatch, exit_code=1)

    argv = [
        "--job-id",
        "1",
        "--execution-id",
        "2",
        "--db-path",
        str(tmp_path / "republisher.db"),
        "--out-dir",
        str(out_dir),
        "--stats-path",
        str(tmp_path / "stats.jsonl"),
    ]
    status = job_runner_module.main(argv)

    assert status == 1
    assert live_feed.read_text(encoding="utf-8") == "<rss>old</rss>\n"
    assert next_feed.read_text(encoding="utf-8") == VALID_FEED
 def _patch_worker_dependencies(
    monkeypatch: pytest.MonkeyPatch, *, exit_code: int
 ) -> None:
    """Stub out the job runner's collaborators so ``main`` runs without a crawl.

    The source config, feed URL, and crawler process are replaced with fixed
    stand-ins, and ``_run_crawl`` is replaced with a stub returning ``exit_code``.
    """

    def fake_load_job_source_config(
        *, db_path: object, job_id: object
    ) -> JobSourceConfig:
        return JobSourceConfig(
            source_name="Demo",
            source_slug="demo",
            source_type="feed",
            spider_arguments={},
            feed_url="https://source.example/feed.rss",
        )

    def fake_load_feed_url() -> str:
        return "https://mirror.example"

    def fake_crawler_process(settings: object) -> object:
        return object()

    def fake_run_crawl(
        *, process: object, feed: object, spider_arguments: object
    ) -> int:
        return exit_code

    replacements = (
        ("_load_job_source_config", fake_load_job_source_config),
        ("load_feed_url", fake_load_feed_url),
        ("CrawlerProcess", fake_crawler_process),
        ("_run_crawl", fake_run_crawl),
    )
    for attribute, stub in replacements:
        monkeypatch.setattr(job_runner_module, attribute, stub)
--- a/tests/test_postprocessing.py
+++ b/tests/test_postprocessing.py
@ -0,0 +1,52 @@
 from pathlib import Path
 import pytest
 from repub.config import feed_output_path, staged_feed_output_path
 from repub.postprocessing import publish_staged_feed
 VALID_FEED = '<rss version="2.0"><channel><title>new</title></channel></rss>\n'
 def test_publish_staged_feed_replaces_public_feed(tmp_path: Path) -> None:
    """Publishing swaps the public feed for the staged one and removes the staged file."""
    out_dir = tmp_path / "out"
    live_feed = feed_output_path(out_dir=out_dir, feed_slug="demo")
    next_feed = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    live_feed.parent.mkdir(parents=True)
    live_feed.write_text("<rss>old</rss>\n", encoding="utf-8")
    next_feed.write_text(VALID_FEED, encoding="utf-8")

    result = publish_staged_feed(out_dir=out_dir, feed_slug="demo")

    assert result == live_feed
    assert live_feed.read_text(encoding="utf-8") == VALID_FEED
    assert not next_feed.exists()
 def test_publish_staged_feed_requires_staged_file(tmp_path: Path) -> None:
    """Publishing without a staged file raises ``FileNotFoundError``."""
    out_dir_without_staged_feed = tmp_path / "out"
    with pytest.raises(FileNotFoundError):
        publish_staged_feed(out_dir=out_dir_without_staged_feed, feed_slug="missing")
@pytest.mark.parametrize(
    "staged_feed",
    [
        '<rss version="2.0"/>\n',
        '<rss version="2.0"><channel></rss>\n',
    ],
 )
 def test_publish_staged_feed_rejects_unusable_feed(
    tmp_path: Path, staged_feed: str
 ) -> None:
    """A channel-less or malformed staged feed raises ValueError and changes no files."""
    out_dir = tmp_path / "out"
    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    public_path.parent.mkdir(parents=True)
    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged_path.write_text(staged_feed, encoding="utf-8")
    with pytest.raises(ValueError):
        publish_staged_feed(out_dir=out_dir, feed_slug="demo")
    assert public_path.read_text(encoding="utf-8") == "<rss>old</rss>\n"
    assert staged_path.read_text(encoding="utf-8") == staged_feed
--- a/tests/test_scheduler_runtime.py
+++ b/tests/test_scheduler_runtime.py
@ -1088,7 +1088,11 @@ def test_render_execution_logs_handles_missing_execution_and_missing_log_file(
            await render_execution_logs(app, job_id=job.id, execution_id=9999)
        )
        missing_log = str(
-            await render_execution_logs(app, job_id=job.id, execution_id=execution.id)
+            await render_execution_logs(
                app,
                job_id=job.id,
                execution_id=int(execution.get_id()),
            )
        )
        assert "Execution log unavailable" in missing_execution
Author	SHA1	Message	Date
Abel Luck	3b6503a6ed	style: apply formatter All checks were successful buildbot/nix-eval Build done. Details buildbot/nix-build Build done. Details buildbot/nix-effects Build done. Details	2026-05-27 10:58:07 +02:00
Abel Luck	e64a32d76b	fix: publish feeds atomically	2026-05-27 10:57:21 +02:00
Abel Luck	cbb427b89d	docs: document image pipeline profiles	2026-05-27 10:13:06 +02:00