Add media retention cleanup command

2026-05-27 13:04:47 +02:00 · 2026-05-27 13:04:47 +02:00 · 507074b80e
commit 507074b80e
parent 3b6503a6ed
10 changed files with 722 additions and 52 deletions
--- a/README.md
+++ b/README.md
@ -72,6 +72,27 @@ Operational notes:
  Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
 - Job logs and stats artifacts are written under `out/logs/`.
 Media cleanup:
 - Published media can outlive the current feed when articles fall out of the
  feed window. Use `cleanup-media` to delete old media files that are no longer
  referenced by the latest published `feed.rss`.
 - The default retention window is 25 days. Run a dry run first:
 ```sh
 uv run repub cleanup-media --feeds-dir out/feeds --days 25 --dry-run
 ```
 - Remove `--dry-run` to delete matching files. The command protects media
  referenced by the latest published feed and uses a lock to avoid racing with
  active crawls.
 - For config-driven deployments, pass the runtime config so cleanup uses the
  configured `out_dir` and media directory names:
 ```sh
 uv run repub cleanup-media --config repub.toml --dry-run
 ```
 The legacy one-shot config-driven crawler is still available:
 ```sh
--- a/demo/README.md
+++ b/demo/README.md
@ -30,6 +30,20 @@ variants are written under `images/thumbs/` inside each feed output directory.
 Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml)
 when a demo run needs to disable thumbnails or test a different profile set.
 ## Media Cleanup
 Published media can remain on disk after articles fall out of the current feed.
 Run cleanup in dry-run mode first:
 ```shell
 uv run repub cleanup-media --config demo/repub.toml --dry-run
 ```
 With `--config`, cleanup scans `demo/out/feeds/` and honors any
 `REPUBLISHER_*_DIR` media directory overrides in the config. Remove `--dry-run`
 to delete old unreferenced media. The default retention window is 25 days; use
 `--days N` to override it.
 ## Local File Feed
 `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:
--- a/demo/repub.toml
+++ b/demo/repub.toml
@ -22,3 +22,8 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
 # images plus JPEG thumbnails.
 # REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
 # REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true
 # Media cleanup can use this config:
 # uv run repub cleanup-media --config demo/repub.toml --dry-run
 # It scans out_dir/feeds, honors REPUBLISHER_*_DIR overrides, and defaults to a
 # 25-day retention window for old media not referenced by the latest feed.rss.
--- a/repub/cleanup.py
+++ b/repub/cleanup.py
@ -0,0 +1,188 @@
 from __future__ import annotations
 import fcntl
 import html
 import re
 import sys
 from collections.abc import Iterator, Sequence
 from contextlib import contextmanager
 from dataclasses import dataclass
 from datetime import UTC, datetime, timedelta
 from pathlib import Path
 from typing import TextIO
 from urllib.parse import unquote, urlsplit
 DEFAULT_MEDIA_DIRS = ("images", "audio", "video", "files")
 MEDIA_RETENTION_LOCK = ".media-retention.lock"
@dataclass
 class CleanupResult:
    scanned_root: Path
    cutoff: datetime
    dry_run: bool
    matched_files: int = 0
    deleted_files: int = 0
    bytes_deleted: int = 0
    failures: int = 0
 def _bool_text(value: bool) -> str:
    return "true" if value else "false"
 def _normalize_media_dirs(media_dirs: Sequence[str]) -> tuple[str, ...]:
    normalized = tuple(
        dict.fromkeys(
            normalized_dir
            for media_dir in media_dirs
            if (normalized_dir := media_dir.strip("/"))
        )
    )
    if not normalized:
        raise ValueError("media_dirs must include at least one media directory")
    return normalized
 def _feed_reference_re(media_dirs: Sequence[str]) -> re.Pattern[str]:
    media_names = "|".join(re.escape(media_dir) for media_dir in media_dirs)
    return re.compile(
        rf"""(?:https?://[^"'<>\s,]+|/?(?:feeds/[^/"'<>\s,]+/)?(?:{media_names})/[^"'<>\s,]+)"""
    )
@contextmanager
 def media_retention_lock(*, out_dir: Path, exclusive: bool) -> Iterator[None]:
    out_dir = out_dir.resolve()
    out_dir.mkdir(parents=True, exist_ok=True)
    lock_mode = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
    with (out_dir / MEDIA_RETENTION_LOCK).open("a", encoding="utf-8") as lock_file:
        fcntl.flock(lock_file.fileno(), lock_mode)
        try:
            yield
        finally:
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
 def _feed_dirs(feeds_dir: Path) -> list[Path]:
    if not feeds_dir.exists():
        return []
    return sorted(path for path in feeds_dir.iterdir() if path.is_dir())
 def _referenced_media_paths(
    feed_dir: Path, feed_body: str, media_dirs: Sequence[str]
 ) -> set[Path]:
    protected: set[Path] = set()
    slug = feed_dir.name
    feed_prefix = f"/feeds/{slug}/"
    feed_root = feed_dir.resolve()
    media_dir_set = set(media_dirs)
    for match in _feed_reference_re(media_dirs).finditer(feed_body):
        reference = html.unescape(match.group(0))
        path = unquote(urlsplit(reference).path)
        if path.startswith(feed_prefix):
            relative_path = path.removeprefix(feed_prefix)
        else:
            relative_path = path.lstrip("/")
        if relative_path.split("/", maxsplit=1)[0] not in media_dir_set:
            continue
        candidate = (feed_dir / relative_path).resolve()
        if candidate.is_relative_to(feed_root):
            protected.add(candidate)
    return protected
 def collect_protected_paths(
    feeds_dir: Path, media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS
 ) -> set[Path]:
    media_dirs = _normalize_media_dirs(media_dirs)
    protected: set[Path] = set()
    for feed_dir in _feed_dirs(feeds_dir):
        feed_path = feed_dir / "feed.rss"
        if not feed_path.exists():
            continue
        protected.update(
            _referenced_media_paths(
                feed_dir,
                feed_path.read_text(encoding="utf-8", errors="replace"),
                media_dirs,
            )
        )
    return protected
 def _media_files(feeds_dir: Path, media_dirs: Sequence[str]) -> list[Path]:
    files: list[Path] = []
    for feed_dir in _feed_dirs(feeds_dir):
        for media_dir_name in media_dirs:
            media_dir = feed_dir / media_dir_name
            if not media_dir.exists():
                continue
            files.extend(path for path in media_dir.rglob("*") if path.is_file())
    return sorted(files)
 def cleanup_media(
    *,
    feeds_dir: Path,
    retention_days: int = 25,
    now: datetime | None = None,
    dry_run: bool = False,
    output: TextIO = sys.stdout,
    media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS,
 ) -> CleanupResult:
    if now is None:
        now = datetime.now(UTC)
    elif now.tzinfo is None:
        now = now.replace(tzinfo=UTC)
    cutoff = now - timedelta(days=retention_days)
    cutoff_timestamp = cutoff.timestamp()
    feeds_dir = feeds_dir.resolve()
    media_dirs = _normalize_media_dirs(media_dirs)
    with media_retention_lock(out_dir=feeds_dir.parent, exclusive=True):
        protected = collect_protected_paths(feeds_dir, media_dirs=media_dirs)
        result = CleanupResult(scanned_root=feeds_dir, cutoff=cutoff, dry_run=dry_run)
        for path in _media_files(feeds_dir, media_dirs):
            try:
                stat = path.stat()
            except OSError as error:
                result.failures += 1
                print(
                    f"media cleanup: stat failed path={path} error={error}",
                    file=output,
                )
                continue
            if stat.st_mtime >= cutoff_timestamp:
                continue
            if path.resolve() in protected:
                continue
            result.matched_files += 1
            if dry_run:
                continue
            try:
                path.unlink()
            except OSError as error:
                result.failures += 1
                print(
                    f"media cleanup: delete failed path={path} error={error}",
                    file=output,
                )
                continue
            result.deleted_files += 1
            result.bytes_deleted += stat.st_size
        print(
            "media cleanup: "
            f"dry_run={_bool_text(result.dry_run)} "
            f"cutoff={result.cutoff.isoformat()} "
            f"root={result.scanned_root} "
            f"matched_files={result.matched_files} "
            f"deleted_files={result.deleted_files} "
            f"bytes_deleted={result.bytes_deleted} "
            f"failures={result.failures}",
            file=output,
        )
    return result
--- a/repub/crawl.py
+++ b/repub/crawl.py
@ -7,6 +7,7 @@ from scrapy.crawler import Crawler, CrawlerProcess
 from scrapy.settings import Settings
 from twisted.python.failure import Failure
 from repub.cleanup import media_retention_lock
 from repub.config import (
    FeedConfig,
    build_base_settings,
@ -103,8 +104,9 @@ def run_feeds(
        deferred.addCallbacks(handle_success, handle_error)
        deferred.addBoth(crawl_next)
-    crawl_next()
+    with media_retention_lock(out_dir=out_dir, exclusive=False):
-    process.start(stop_after_crawl=False)
+        crawl_next()
        process.start(stop_after_crawl=False)
    return 1 if any(failure is not None for _, failure in results) else 0
--- a/repub/entrypoint.py
+++ b/repub/entrypoint.py
@ -7,11 +7,21 @@ import os
 import signal
 import sys
 from contextlib import suppress
 from pathlib import Path
 from hypercorn.asyncio import serve as hypercorn_serve
 from hypercorn.config import Config as HypercornConfig
 import repub.crawl as crawl_module
 from repub.cleanup import DEFAULT_MEDIA_DIRS, cleanup_media
 from repub.config import (
    AUDIO_DIR,
    FILE_DIR,
    IMAGE_DIR,
    VIDEO_DIR,
    build_base_settings,
    load_config,
 )
 from repub.web import SHUTDOWN_EVENT_KEY, create_app
 FeedNameFilter = crawl_module.FeedNameFilter
@ -61,11 +71,39 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
        default="repub.toml",
        help="Path to runtime config TOML file",
    )
    cleanup_parser = subparsers.add_parser(
        "cleanup-media",
        help="Delete old unreferenced published media",
    )
    cleanup_parser.add_argument(
        "-c",
        "--config",
        default=None,
        help="Read output and media directory settings from runtime config TOML",
    )
    cleanup_parser.add_argument(
        "--feeds-dir",
        default=None,
        help="Published feeds directory to clean (default: config out_dir/feeds or out/feeds)",
    )
    cleanup_parser.add_argument(
        "--days",
        type=int,
        default=25,
        help="Delete unreferenced media older than this many days",
    )
    cleanup_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Report cleanup matches without deleting files",
    )
    if not raw_args:
        raw_args = ["serve", "--dev-mode"]
    elif raw_args[0] in {"-c", "--config"}:
        raw_args = ["crawl", *raw_args]
-    elif raw_args[0] not in {"serve", "crawl"}:
+    elif raw_args[0] not in {"serve", "crawl", "cleanup-media"}:
        raw_args = ["serve", "--dev-mode", *raw_args]
    args = parser.parse_args(raw_args)
@ -73,6 +111,25 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
    return command, args
 def _cleanup_config(args: argparse.Namespace) -> tuple[Path, tuple[str, ...]]:
    feeds_dir = Path(args.feeds_dir) if args.feeds_dir else Path("out/feeds")
    media_dirs = DEFAULT_MEDIA_DIRS
    if args.config is None:
        return feeds_dir, media_dirs
    config = load_config(args.config)
    settings = build_base_settings(config)
    media_dirs = (
        str(settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)),
        str(settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)),
        str(settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)),
        str(settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)),
    )
    if args.feeds_dir is None:
        feeds_dir = config.out_dir / "feeds"
    return feeds_dir, media_dirs
 def _install_signal_handlers(stop_event: asyncio.Event) -> None:
    loop = asyncio.get_running_loop()
@ -116,6 +173,29 @@ def entrypoint(argv: list[str] | None = None) -> int:
        crawl_module.check_runtime = check_runtime
        return crawl_module.crawl_from_config(args.config)
    if command == "cleanup-media":
        try:
            feeds_dir, media_dirs = _cleanup_config(args)
        except FileNotFoundError as error:
            missing_path = (
                Path(error.filename).expanduser()
                if error.filename
                else Path(args.config).expanduser()
            )
            logger.error("Config file not found: %s", missing_path)
            return 2
        except ValueError as error:
            logger.error("Invalid config: %s", error)
            return 2
        result = cleanup_media(
            feeds_dir=feeds_dir,
            retention_days=args.days,
            dry_run=bool(args.dry_run),
            media_dirs=media_dirs,
        )
        return 1 if result.failures else 0
    try:
        port = int(args.port)
    except ValueError:
--- a/repub/job_runner.py
+++ b/repub/job_runner.py
@ -14,6 +14,7 @@ from scrapy.crawler import CrawlerProcess
 from scrapy.statscollectors import StatsCollector
 from twisted.python.failure import Failure
 from repub.cleanup import media_retention_lock
 from repub.config import (
    FeedConfig,
    RepublisherConfig,
@ -260,59 +261,60 @@ def main(argv: list[str] | None = None) -> int:
    stats_path = Path(args.stats_path).resolve()
    log_path = stats_path.with_suffix(".log")
-    try:
+    with media_retention_lock(out_dir=out_dir, exclusive=False):
        feed = _resolve_feed(
            source_config=source_config,
            out_dir=out_dir,
            log_path=log_path,
        )
        process = CrawlerProcess(
            _build_crawl_settings(
                out_dir=out_dir,
                feed=feed,
                stats_path=stats_path,
                convert_images=source_config.convert_images,
                convert_video=source_config.convert_video,
                feed_url=load_feed_url(),
            )
        )
        print(
            f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
            flush=True,
        )
        exit_code = _run_crawl(
            process=process,
            feed=feed,
            spider_arguments=source_config.spider_arguments,
        )
    except Exception as error:
        print(
            f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
            flush=True,
        )
        return 1
    if stop_requested:
        print(
            f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
            flush=True,
        )
        return 130
    if exit_code == 0:
        try:
-            publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+            feed = _resolve_feed(
                source_config=source_config,
                out_dir=out_dir,
                log_path=log_path,
            )
            process = CrawlerProcess(
                _build_crawl_settings(
                    out_dir=out_dir,
                    feed=feed,
                    stats_path=stats_path,
                    convert_images=source_config.convert_images,
                    convert_video=source_config.convert_video,
                    feed_url=load_feed_url(),
                )
            )
            print(
                f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
                flush=True,
            )
            exit_code = _run_crawl(
                process=process,
                feed=feed,
                spider_arguments=source_config.spider_arguments,
            )
        except Exception as error:
            print(
-                f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
+                f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
                flush=True,
            )
            return 1
-        print(
+
-            f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
+        if stop_requested:
-            flush=True,
+            print(
-        )
+                f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
-    return exit_code
+                flush=True,
            )
            return 130
        if exit_code == 0:
            try:
                publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
            except Exception as error:
                print(
                    f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
                    flush=True,
                )
                return 1
            print(
                f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
                flush=True,
            )
        return exit_code
 def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig:
--- a/tests/test_cleanup.py
+++ b/tests/test_cleanup.py
@ -0,0 +1,200 @@
 import fcntl
 import io
 import os
 import subprocess
 import sys
 import time
 from datetime import UTC, datetime, timedelta
 from pathlib import Path
 from repub.cleanup import cleanup_media
 NOW = datetime(2026, 5, 27, 12, 0, tzinfo=UTC)
 def write_media(path: Path, body: bytes, *, age_days: int) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(body)
    timestamp = (NOW - timedelta(days=age_days)).timestamp()
    os.utime(path, (timestamp, timestamp))
 def wait_until(path: Path, *, timeout: float = 5.0) -> None:
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if path.exists():
            return
        time.sleep(0.05)
    raise AssertionError(f"timed out waiting for {path}")
 def test_cleanup_media_deletes_old_unreferenced_media_and_protects_latest_feed_refs(
    tmp_path: Path,
 ) -> None:
    feeds_dir = tmp_path / "feeds"
    demo_dir = feeds_dir / "demo"
    demo_dir.mkdir(parents=True)
    (demo_dir / "feed.rss").write_text(
        """
 <rss xmlns:content="http://purl.org/rss/1.0/modules/content/">
  <channel>
    <item>
      <enclosure url="https://mirror.example/feeds/demo/audio/current.mp3" />
      <media:thumbnail xmlns:media="http://search.yahoo.com/mrss/" url="/images/thumbs/current.jpg" />
      <content:encoded><![CDATA[<img src="images/full/current.webp">]]></content:encoded>
    </item>
  </channel>
 </rss>
 """.strip(),
        encoding="utf-8",
    )
    write_media(demo_dir / "audio" / "current.mp3", b"audio", age_days=40)
    write_media(demo_dir / "images" / "full" / "current.webp", b"webp", age_days=40)
    write_media(demo_dir / "images" / "thumbs" / "current.jpg", b"jpg", age_days=40)
    write_media(demo_dir / "images" / "source" / "current.png", b"source", age_days=40)
    write_media(demo_dir / "video" / "old.mp4", b"video", age_days=40)
    write_media(demo_dir / "files" / "fresh.pdf", b"fresh", age_days=2)
    write_media(demo_dir / "images" / "full" / "old.webp", b"old", age_days=40)
    write_media(demo_dir / ".feed.rss.next", b"staged", age_days=40)
    output = io.StringIO()
    result = cleanup_media(
        feeds_dir=feeds_dir,
        retention_days=25,
        now=NOW,
        dry_run=False,
        output=output,
    )
    assert (demo_dir / "audio" / "current.mp3").exists()
    assert (demo_dir / "images" / "full" / "current.webp").exists()
    assert (demo_dir / "images" / "thumbs" / "current.jpg").exists()
    assert not (demo_dir / "images" / "source" / "current.png").exists()
    assert not (demo_dir / "video" / "old.mp4").exists()
    assert not (demo_dir / "images" / "full" / "old.webp").exists()
    assert (demo_dir / "files" / "fresh.pdf").exists()
    assert (demo_dir / ".feed.rss.next").exists()
    assert result.matched_files == 3
    assert result.deleted_files == 3
    assert result.bytes_deleted == len(b"source") + len(b"video") + len(b"old")
    assert result.failures == 0
    assert "dry_run=false" in output.getvalue()
    assert "deleted_files=3" in output.getvalue()
 def test_cleanup_media_dry_run_reports_matches_without_deleting(tmp_path: Path) -> None:
    feeds_dir = tmp_path / "feeds"
    old_file = feeds_dir / "demo" / "audio" / "old.mp3"
    write_media(old_file, b"audio", age_days=40)
    result = cleanup_media(
        feeds_dir=feeds_dir,
        retention_days=25,
        now=NOW,
        dry_run=True,
        output=io.StringIO(),
    )
    assert old_file.exists()
    assert result.matched_files == 1
    assert result.deleted_files == 0
    assert result.bytes_deleted == 0
    assert result.failures == 0
 def test_cleanup_media_uses_configured_media_dirs(tmp_path: Path) -> None:
    feeds_dir = tmp_path / "feeds"
    demo_dir = feeds_dir / "demo"
    demo_dir.mkdir(parents=True)
    (demo_dir / "feed.rss").write_text(
        """
 <rss>
  <channel>
    <item>
      <enclosure url="https://mirror.example/feeds/demo/audio-custom/current.mp3" />
      <media:content xmlns:media="http://search.yahoo.com/mrss/" url="/videos-custom/current.mp4" />
    </item>
  </channel>
 </rss>
 """.strip(),
        encoding="utf-8",
    )
    write_media(demo_dir / "audio-custom" / "current.mp3", b"current", age_days=40)
    write_media(demo_dir / "audio-custom" / "old.mp3", b"old", age_days=40)
    write_media(demo_dir / "videos-custom" / "current.mp4", b"video", age_days=40)
    write_media(demo_dir / "audio" / "legacy.mp3", b"legacy", age_days=40)
    result = cleanup_media(
        feeds_dir=feeds_dir,
        retention_days=25,
        now=NOW,
        media_dirs=("audio-custom", "videos-custom"),
        output=io.StringIO(),
    )
    assert (demo_dir / "audio-custom" / "current.mp3").exists()
    assert not (demo_dir / "audio-custom" / "old.mp3").exists()
    assert (demo_dir / "videos-custom" / "current.mp4").exists()
    assert (demo_dir / "audio" / "legacy.mp3").exists()
    assert result.matched_files == 1
    assert result.deleted_files == 1
    assert result.failures == 0
 def test_cleanup_media_waits_for_active_crawl_media_lock(tmp_path: Path) -> None:
    out_dir = tmp_path / "out"
    feeds_dir = out_dir / "feeds"
    old_file = feeds_dir / "demo" / "audio" / "old.mp3"
    write_media(old_file, b"audio", age_days=40)
    lock_path = out_dir / ".media-retention.lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    started_path = tmp_path / "cleanup-started"
    done_path = tmp_path / "cleanup-done"
    script = """
 import io
 import sys
 from datetime import UTC, datetime
 from pathlib import Path
 from repub.cleanup import cleanup_media
 Path(sys.argv[2]).write_text("started", encoding="utf-8")
 cleanup_media(
    feeds_dir=Path(sys.argv[1]),
    retention_days=25,
    now=datetime(2026, 5, 27, 12, 0, tzinfo=UTC),
    output=io.StringIO(),
 )
 Path(sys.argv[3]).write_text("done", encoding="utf-8")
 """
    with lock_path.open("a", encoding="utf-8") as lock_file:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_SH)
        process = subprocess.Popen(
            [
                sys.executable,
                "-c",
                script,
                str(feeds_dir),
                str(started_path),
                str(done_path),
            ],
            cwd=Path.cwd(),
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            text=True,
        )
        try:
            wait_until(started_path)
            time.sleep(0.5)
            assert old_file.exists()
            assert process.poll() is None
            assert not done_path.exists()
        finally:
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
    stdout, stderr = process.communicate(timeout=5)
    assert process.returncode == 0, stdout + stderr
    assert not old_file.exists()
    assert done_path.exists()
--- a/tests/test_entrypoint.py
+++ b/tests/test_entrypoint.py
@ -39,6 +39,102 @@ def test_parse_args_supports_dev_mode_flag() -> None:
    assert args.dev_mode is True
 def test_parse_args_supports_cleanup_media_defaults() -> None:
    command, args = parse_args(["cleanup-media"])
    assert command == "cleanup-media"
    assert args.config is None
    assert args.feeds_dir is None
    assert args.days == 25
    assert args.dry_run is False
 def test_entrypoint_runs_cleanup_media(monkeypatch, tmp_path) -> None:
    recorded: dict[str, object] = {}
    class FakeResult:
        failures = 0
    def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs):
        recorded["feeds_dir"] = feeds_dir
        recorded["retention_days"] = retention_days
        recorded["dry_run"] = dry_run
        recorded["media_dirs"] = media_dirs
        return FakeResult()
    monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media)
    exit_code = entrypoint(
        [
            "cleanup-media",
            "--feeds-dir",
            str(tmp_path / "feeds"),
            "--days",
            "10",
            "--dry-run",
        ]
    )
    assert exit_code == 0
    assert recorded == {
        "feeds_dir": tmp_path / "feeds",
        "retention_days": 10,
        "dry_run": True,
        "media_dirs": ("images", "audio", "video", "files"),
    }
 def test_entrypoint_cleanup_media_uses_configured_media_dirs(
    monkeypatch, tmp_path
 ) -> None:
    config_path = tmp_path / "repub.toml"
    config_path.write_text(
        """
 out_dir = "mirror"
 [[feeds]]
 name = "Demo"
 slug = "demo"
 url = "https://source.example/feed.rss"
 [scrapy.settings]
 REPUBLISHER_IMAGE_DIR = "images-custom"
 REPUBLISHER_AUDIO_DIR = "audio-custom"
 REPUBLISHER_VIDEO_DIR = "videos-custom"
 REPUBLISHER_FILE_DIR = "files-custom"
 """.strip(),
        encoding="utf-8",
    )
    recorded: dict[str, object] = {}
    class FakeResult:
        failures = 0
    def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs):
        recorded["feeds_dir"] = feeds_dir
        recorded["retention_days"] = retention_days
        recorded["dry_run"] = dry_run
        recorded["media_dirs"] = media_dirs
        return FakeResult()
    monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media)
    exit_code = entrypoint(["cleanup-media", "--config", str(config_path)])
    assert exit_code == 0
    assert recorded == {
        "feeds_dir": tmp_path / "mirror" / "feeds",
        "retention_days": 25,
        "dry_run": False,
        "media_dirs": (
            "images-custom",
            "audio-custom",
            "videos-custom",
            "files-custom",
        ),
    }
 def test_parse_args_defaults_to_dev_mode_when_no_args() -> None:
    command, args = parse_args([])
--- a/tests/test_job_runner.py
+++ b/tests/test_job_runner.py
@ -1,3 +1,5 @@
 import subprocess
 import sys
 from pathlib import Path
 import pytest
@ -72,6 +74,66 @@ def test_main_publishes_staged_feed_after_successful_crawl(
    assert not staged_path.exists()
 def test_main_holds_media_cleanup_lock_during_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
 ) -> None:
    out_dir = tmp_path / "out"
    public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
    staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
    public_path.parent.mkdir(parents=True)
    public_path.write_text("<rss>old</rss>\n", encoding="utf-8")
    staged_path.write_text(VALID_FEED, encoding="utf-8")
    def assert_media_lock_is_held(*, process, feed, spider_arguments) -> int:
        lock_path = out_dir.resolve() / ".media-retention.lock"
        script = """
 import fcntl
 import sys
 from pathlib import Path
 lock_path = Path(sys.argv[1])
 lock_path.parent.mkdir(parents=True, exist_ok=True)
 with lock_path.open("a", encoding="utf-8") as lock_file:
    try:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        sys.exit(0)
    else:
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
        sys.exit(2)
 """
        completed = subprocess.run(
            [sys.executable, "-c", script, str(lock_path)],
            cwd=Path.cwd(),
            capture_output=True,
            check=False,
            text=True,
        )
        assert completed.returncode == 0, completed.stdout + completed.stderr
        return 0
    _patch_worker_dependencies(
        monkeypatch, exit_code=0, run_crawl=assert_media_lock_is_held
    )
    exit_code = job_runner_module.main(
        [
            "--job-id",
            "1",
            "--execution-id",
            "2",
            "--db-path",
            str(tmp_path / "republisher.db"),
            "--out-dir",
            str(out_dir),
            "--stats-path",
            str(tmp_path / "stats.jsonl"),
        ]
    )
    assert exit_code == 0
 def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
 ) -> None:
@ -137,7 +199,7 @@ def test_main_does_not_publish_staged_feed_after_failed_crawl(
 def _patch_worker_dependencies(
-    monkeypatch: pytest.MonkeyPatch, *, exit_code: int
+    monkeypatch: pytest.MonkeyPatch, *, exit_code: int, run_crawl=None
 ) -> None:
    monkeypatch.setattr(
        job_runner_module,
@ -161,5 +223,5 @@ def _patch_worker_dependencies(
    monkeypatch.setattr(
        job_runner_module,
        "_run_crawl",
-        lambda *, process, feed, spider_arguments: exit_code,
+        run_crawl or (lambda *, process, feed, spider_arguments: exit_code),
    )