diff --git a/README.md b/README.md
index cab926d..b48c22a 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,27 @@ Operational notes:
Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs.
- Job logs and stats artifacts are written under `out/logs/`.
+Media cleanup:
+
+- Published media can outlive the current feed when articles fall out of the
+ feed window. Use `cleanup-media` to delete old media files that are no longer
+ referenced by the latest published `feed.rss`.
+- The default retention window is 25 days. Run a dry run first:
+
+```sh
+uv run repub cleanup-media --feeds-dir out/feeds --days 25 --dry-run
+```
+
+- Remove `--dry-run` to delete matching files. The command protects media
+ referenced by the latest published feed and uses a lock to avoid racing with
+ active crawls.
+- For config-driven deployments, pass the runtime config so cleanup uses the
+ configured `out_dir` and media directory names:
+
+```sh
+uv run repub cleanup-media --config repub.toml --dry-run
+```
+
The legacy one-shot config-driven crawler is still available:
```sh
diff --git a/demo/README.md b/demo/README.md
index af4f0b8..652959e 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -30,6 +30,20 @@ variants are written under `images/thumbs/` inside each feed output directory.
Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml)
when a demo run needs to disable thumbnails or test a different profile set.
+## Media Cleanup
+
+Published media can remain on disk after articles fall out of the current feed.
+Run cleanup in dry-run mode first:
+
+```shell
+uv run repub cleanup-media --config demo/repub.toml --dry-run
+```
+
+With `--config`, cleanup scans `demo/out/feeds/` and honors any
+`REPUBLISHER_*_DIR` media directory overrides in the config. Remove `--dry-run`
+to delete old unreferenced media. The default retention window is 25 days; use
+`--days N` to override it.
+
## Local File Feed
`repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root:
diff --git a/demo/repub.toml b/demo/repub.toml
index d829325..df8aa67 100644
--- a/demo/repub.toml
+++ b/demo/repub.toml
@@ -22,3 +22,8 @@ REPUBLISHER_FEED_URL = "https://mirror.example"
# images plus JPEG thumbnails.
# REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true
# REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true
+
+# Media cleanup can use this config:
+# uv run repub cleanup-media --config demo/repub.toml --dry-run
+# It scans out_dir/feeds, honors REPUBLISHER_*_DIR overrides, and defaults to a
+# 25-day retention window for old media not referenced by the latest feed.rss.
diff --git a/repub/cleanup.py b/repub/cleanup.py
new file mode 100644
index 0000000..387c62a
--- /dev/null
+++ b/repub/cleanup.py
@@ -0,0 +1,188 @@
+from __future__ import annotations
+
+import fcntl
+import html
+import re
+import sys
+from collections.abc import Iterator, Sequence
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import UTC, datetime, timedelta
+from pathlib import Path
+from typing import TextIO
+from urllib.parse import unquote, urlsplit
+
+DEFAULT_MEDIA_DIRS = ("images", "audio", "video", "files")
+MEDIA_RETENTION_LOCK = ".media-retention.lock"
+
+
+@dataclass
+class CleanupResult:
+ scanned_root: Path
+ cutoff: datetime
+ dry_run: bool
+ matched_files: int = 0
+ deleted_files: int = 0
+ bytes_deleted: int = 0
+ failures: int = 0
+
+
+def _bool_text(value: bool) -> str:
+ return "true" if value else "false"
+
+
+def _normalize_media_dirs(media_dirs: Sequence[str]) -> tuple[str, ...]:
+ normalized = tuple(
+ dict.fromkeys(
+ normalized_dir
+ for media_dir in media_dirs
+ if (normalized_dir := media_dir.strip("/"))
+ )
+ )
+ if not normalized:
+ raise ValueError("media_dirs must include at least one media directory")
+ return normalized
+
+
+def _feed_reference_re(media_dirs: Sequence[str]) -> re.Pattern[str]:
+ media_names = "|".join(re.escape(media_dir) for media_dir in media_dirs)
+ return re.compile(
+ rf"""(?:https?://[^"'<>\s,]+|/?(?:feeds/[^/"'<>\s,]+/)?(?:{media_names})/[^"'<>\s,]+)"""
+ )
+
+
+@contextmanager
+def media_retention_lock(*, out_dir: Path, exclusive: bool) -> Iterator[None]:
+ out_dir = out_dir.resolve()
+ out_dir.mkdir(parents=True, exist_ok=True)
+ lock_mode = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
+ with (out_dir / MEDIA_RETENTION_LOCK).open("a", encoding="utf-8") as lock_file:
+ fcntl.flock(lock_file.fileno(), lock_mode)
+ try:
+ yield
+ finally:
+ fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
+
+
+def _feed_dirs(feeds_dir: Path) -> list[Path]:
+ if not feeds_dir.exists():
+ return []
+ return sorted(path for path in feeds_dir.iterdir() if path.is_dir())
+
+
+def _referenced_media_paths(
+ feed_dir: Path, feed_body: str, media_dirs: Sequence[str]
+) -> set[Path]:
+ protected: set[Path] = set()
+ slug = feed_dir.name
+ feed_prefix = f"/feeds/{slug}/"
+ feed_root = feed_dir.resolve()
+ media_dir_set = set(media_dirs)
+ for match in _feed_reference_re(media_dirs).finditer(feed_body):
+ reference = html.unescape(match.group(0))
+ path = unquote(urlsplit(reference).path)
+ if path.startswith(feed_prefix):
+ relative_path = path.removeprefix(feed_prefix)
+ else:
+ relative_path = path.lstrip("/")
+ if relative_path.split("/", maxsplit=1)[0] not in media_dir_set:
+ continue
+ candidate = (feed_dir / relative_path).resolve()
+ if candidate.is_relative_to(feed_root):
+ protected.add(candidate)
+ return protected
+
+
+def collect_protected_paths(
+ feeds_dir: Path, media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS
+) -> set[Path]:
+ media_dirs = _normalize_media_dirs(media_dirs)
+ protected: set[Path] = set()
+ for feed_dir in _feed_dirs(feeds_dir):
+ feed_path = feed_dir / "feed.rss"
+ if not feed_path.exists():
+ continue
+ protected.update(
+ _referenced_media_paths(
+ feed_dir,
+ feed_path.read_text(encoding="utf-8", errors="replace"),
+ media_dirs,
+ )
+ )
+ return protected
+
+
+def _media_files(feeds_dir: Path, media_dirs: Sequence[str]) -> list[Path]:
+ files: list[Path] = []
+ for feed_dir in _feed_dirs(feeds_dir):
+ for media_dir_name in media_dirs:
+ media_dir = feed_dir / media_dir_name
+ if not media_dir.exists():
+ continue
+ files.extend(path for path in media_dir.rglob("*") if path.is_file())
+ return sorted(files)
+
+
+def cleanup_media(
+ *,
+ feeds_dir: Path,
+ retention_days: int = 25,
+ now: datetime | None = None,
+ dry_run: bool = False,
+ output: TextIO = sys.stdout,
+ media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS,
+) -> CleanupResult:
+ if now is None:
+ now = datetime.now(UTC)
+ elif now.tzinfo is None:
+ now = now.replace(tzinfo=UTC)
+
+ cutoff = now - timedelta(days=retention_days)
+ cutoff_timestamp = cutoff.timestamp()
+ feeds_dir = feeds_dir.resolve()
+ media_dirs = _normalize_media_dirs(media_dirs)
+ with media_retention_lock(out_dir=feeds_dir.parent, exclusive=True):
+ protected = collect_protected_paths(feeds_dir, media_dirs=media_dirs)
+ result = CleanupResult(scanned_root=feeds_dir, cutoff=cutoff, dry_run=dry_run)
+
+ for path in _media_files(feeds_dir, media_dirs):
+ try:
+ stat = path.stat()
+ except OSError as error:
+ result.failures += 1
+ print(
+ f"media cleanup: stat failed path={path} error={error}",
+ file=output,
+ )
+ continue
+ if stat.st_mtime >= cutoff_timestamp:
+ continue
+ if path.resolve() in protected:
+ continue
+ result.matched_files += 1
+ if dry_run:
+ continue
+ try:
+ path.unlink()
+ except OSError as error:
+ result.failures += 1
+ print(
+ f"media cleanup: delete failed path={path} error={error}",
+ file=output,
+ )
+ continue
+ result.deleted_files += 1
+ result.bytes_deleted += stat.st_size
+
+ print(
+ "media cleanup: "
+ f"dry_run={_bool_text(result.dry_run)} "
+ f"cutoff={result.cutoff.isoformat()} "
+ f"root={result.scanned_root} "
+ f"matched_files={result.matched_files} "
+ f"deleted_files={result.deleted_files} "
+ f"bytes_deleted={result.bytes_deleted} "
+ f"failures={result.failures}",
+ file=output,
+ )
+ return result
diff --git a/repub/crawl.py b/repub/crawl.py
index 6f0d9f4..3c7f41b 100644
--- a/repub/crawl.py
+++ b/repub/crawl.py
@@ -7,6 +7,7 @@ from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.settings import Settings
from twisted.python.failure import Failure
+from repub.cleanup import media_retention_lock
from repub.config import (
FeedConfig,
build_base_settings,
@@ -103,8 +104,9 @@ def run_feeds(
deferred.addCallbacks(handle_success, handle_error)
deferred.addBoth(crawl_next)
- crawl_next()
- process.start(stop_after_crawl=False)
+ with media_retention_lock(out_dir=out_dir, exclusive=False):
+ crawl_next()
+ process.start(stop_after_crawl=False)
return 1 if any(failure is not None for _, failure in results) else 0
diff --git a/repub/entrypoint.py b/repub/entrypoint.py
index 1cc7932..4d0bc83 100644
--- a/repub/entrypoint.py
+++ b/repub/entrypoint.py
@@ -7,11 +7,21 @@ import os
import signal
import sys
from contextlib import suppress
+from pathlib import Path
from hypercorn.asyncio import serve as hypercorn_serve
from hypercorn.config import Config as HypercornConfig
import repub.crawl as crawl_module
+from repub.cleanup import DEFAULT_MEDIA_DIRS, cleanup_media
+from repub.config import (
+ AUDIO_DIR,
+ FILE_DIR,
+ IMAGE_DIR,
+ VIDEO_DIR,
+ build_base_settings,
+ load_config,
+)
from repub.web import SHUTDOWN_EVENT_KEY, create_app
FeedNameFilter = crawl_module.FeedNameFilter
@@ -61,11 +71,39 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
default="repub.toml",
help="Path to runtime config TOML file",
)
+
+ cleanup_parser = subparsers.add_parser(
+ "cleanup-media",
+ help="Delete old unreferenced published media",
+ )
+ cleanup_parser.add_argument(
+ "-c",
+ "--config",
+ default=None,
+ help="Read output and media directory settings from runtime config TOML",
+ )
+ cleanup_parser.add_argument(
+ "--feeds-dir",
+ default=None,
+ help="Published feeds directory to clean (default: config out_dir/feeds or out/feeds)",
+ )
+ cleanup_parser.add_argument(
+ "--days",
+ type=int,
+ default=25,
+ help="Delete unreferenced media older than this many days",
+ )
+ cleanup_parser.add_argument(
+ "--dry-run",
+ action="store_true",
+ help="Report cleanup matches without deleting files",
+ )
+
if not raw_args:
raw_args = ["serve", "--dev-mode"]
elif raw_args[0] in {"-c", "--config"}:
raw_args = ["crawl", *raw_args]
- elif raw_args[0] not in {"serve", "crawl"}:
+ elif raw_args[0] not in {"serve", "crawl", "cleanup-media"}:
raw_args = ["serve", "--dev-mode", *raw_args]
args = parser.parse_args(raw_args)
@@ -73,6 +111,25 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
return command, args
+def _cleanup_config(args: argparse.Namespace) -> tuple[Path, tuple[str, ...]]:
+ feeds_dir = Path(args.feeds_dir) if args.feeds_dir else Path("out/feeds")
+ media_dirs = DEFAULT_MEDIA_DIRS
+ if args.config is None:
+ return feeds_dir, media_dirs
+
+ config = load_config(args.config)
+ settings = build_base_settings(config)
+ media_dirs = (
+ str(settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)),
+ str(settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)),
+ str(settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)),
+ str(settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)),
+ )
+ if args.feeds_dir is None:
+ feeds_dir = config.out_dir / "feeds"
+ return feeds_dir, media_dirs
+
+
def _install_signal_handlers(stop_event: asyncio.Event) -> None:
loop = asyncio.get_running_loop()
@@ -116,6 +173,29 @@ def entrypoint(argv: list[str] | None = None) -> int:
crawl_module.check_runtime = check_runtime
return crawl_module.crawl_from_config(args.config)
+ if command == "cleanup-media":
+ try:
+ feeds_dir, media_dirs = _cleanup_config(args)
+ except FileNotFoundError as error:
+ missing_path = (
+ Path(error.filename).expanduser()
+ if error.filename
+ else Path(args.config).expanduser()
+ )
+ logger.error("Config file not found: %s", missing_path)
+ return 2
+ except ValueError as error:
+ logger.error("Invalid config: %s", error)
+ return 2
+
+ result = cleanup_media(
+ feeds_dir=feeds_dir,
+ retention_days=args.days,
+ dry_run=bool(args.dry_run),
+ media_dirs=media_dirs,
+ )
+ return 1 if result.failures else 0
+
try:
port = int(args.port)
except ValueError:
diff --git a/repub/job_runner.py b/repub/job_runner.py
index 008bd15..97abcf0 100644
--- a/repub/job_runner.py
+++ b/repub/job_runner.py
@@ -14,6 +14,7 @@ from scrapy.crawler import CrawlerProcess
from scrapy.statscollectors import StatsCollector
from twisted.python.failure import Failure
+from repub.cleanup import media_retention_lock
from repub.config import (
FeedConfig,
RepublisherConfig,
@@ -260,59 +261,60 @@ def main(argv: list[str] | None = None) -> int:
stats_path = Path(args.stats_path).resolve()
log_path = stats_path.with_suffix(".log")
- try:
- feed = _resolve_feed(
- source_config=source_config,
- out_dir=out_dir,
- log_path=log_path,
- )
- process = CrawlerProcess(
- _build_crawl_settings(
- out_dir=out_dir,
- feed=feed,
- stats_path=stats_path,
- convert_images=source_config.convert_images,
- convert_video=source_config.convert_video,
- feed_url=load_feed_url(),
- )
- )
- print(
- f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
- flush=True,
- )
- exit_code = _run_crawl(
- process=process,
- feed=feed,
- spider_arguments=source_config.spider_arguments,
- )
- except Exception as error:
- print(
- f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
- flush=True,
- )
- return 1
-
- if stop_requested:
- print(
- f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
- flush=True,
- )
- return 130
-
- if exit_code == 0:
+ with media_retention_lock(out_dir=out_dir, exclusive=False):
try:
- publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+ feed = _resolve_feed(
+ source_config=source_config,
+ out_dir=out_dir,
+ log_path=log_path,
+ )
+ process = CrawlerProcess(
+ _build_crawl_settings(
+ out_dir=out_dir,
+ feed=feed,
+ stats_path=stats_path,
+ convert_images=source_config.convert_images,
+ convert_video=source_config.convert_video,
+ feed_url=load_feed_url(),
+ )
+ )
+ print(
+ f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
+ flush=True,
+ )
+ exit_code = _run_crawl(
+ process=process,
+ feed=feed,
+ spider_arguments=source_config.spider_arguments,
+ )
except Exception as error:
print(
- f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
+ f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
flush=True,
)
return 1
- print(
- f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
- flush=True,
- )
- return exit_code
+
+ if stop_requested:
+ print(
+ f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
+ flush=True,
+ )
+ return 130
+
+ if exit_code == 0:
+ try:
+ publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+ except Exception as error:
+ print(
+ f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
+ flush=True,
+ )
+ return 1
+ print(
+ f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
+ flush=True,
+ )
+ return exit_code
def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig:
diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py
new file mode 100644
index 0000000..c5d9748
--- /dev/null
+++ b/tests/test_cleanup.py
@@ -0,0 +1,200 @@
+import fcntl
+import io
+import os
+import subprocess
+import sys
+import time
+from datetime import UTC, datetime, timedelta
+from pathlib import Path
+
+from repub.cleanup import cleanup_media
+
+NOW = datetime(2026, 5, 27, 12, 0, tzinfo=UTC)
+
+
+def write_media(path: Path, body: bytes, *, age_days: int) -> None:
+ path.parent.mkdir(parents=True, exist_ok=True)
+ path.write_bytes(body)
+ timestamp = (NOW - timedelta(days=age_days)).timestamp()
+ os.utime(path, (timestamp, timestamp))
+
+
+def wait_until(path: Path, *, timeout: float = 5.0) -> None:
+ deadline = time.monotonic() + timeout
+ while time.monotonic() < deadline:
+ if path.exists():
+ return
+ time.sleep(0.05)
+ raise AssertionError(f"timed out waiting for {path}")
+
+
+def test_cleanup_media_deletes_old_unreferenced_media_and_protects_latest_feed_refs(
+ tmp_path: Path,
+) -> None:
+ feeds_dir = tmp_path / "feeds"
+ demo_dir = feeds_dir / "demo"
+ demo_dir.mkdir(parents=True)
+ (demo_dir / "feed.rss").write_text(
+ """
+
+
+ -
+
+
+ ]]>
+
+
+
+""".strip(),
+ encoding="utf-8",
+ )
+ write_media(demo_dir / "audio" / "current.mp3", b"audio", age_days=40)
+ write_media(demo_dir / "images" / "full" / "current.webp", b"webp", age_days=40)
+ write_media(demo_dir / "images" / "thumbs" / "current.jpg", b"jpg", age_days=40)
+ write_media(demo_dir / "images" / "source" / "current.png", b"source", age_days=40)
+ write_media(demo_dir / "video" / "old.mp4", b"video", age_days=40)
+ write_media(demo_dir / "files" / "fresh.pdf", b"fresh", age_days=2)
+ write_media(demo_dir / "images" / "full" / "old.webp", b"old", age_days=40)
+ write_media(demo_dir / ".feed.rss.next", b"staged", age_days=40)
+
+ output = io.StringIO()
+ result = cleanup_media(
+ feeds_dir=feeds_dir,
+ retention_days=25,
+ now=NOW,
+ dry_run=False,
+ output=output,
+ )
+
+ assert (demo_dir / "audio" / "current.mp3").exists()
+ assert (demo_dir / "images" / "full" / "current.webp").exists()
+ assert (demo_dir / "images" / "thumbs" / "current.jpg").exists()
+ assert not (demo_dir / "images" / "source" / "current.png").exists()
+ assert not (demo_dir / "video" / "old.mp4").exists()
+ assert not (demo_dir / "images" / "full" / "old.webp").exists()
+ assert (demo_dir / "files" / "fresh.pdf").exists()
+ assert (demo_dir / ".feed.rss.next").exists()
+ assert result.matched_files == 3
+ assert result.deleted_files == 3
+ assert result.bytes_deleted == len(b"source") + len(b"video") + len(b"old")
+ assert result.failures == 0
+ assert "dry_run=false" in output.getvalue()
+ assert "deleted_files=3" in output.getvalue()
+
+
+def test_cleanup_media_dry_run_reports_matches_without_deleting(tmp_path: Path) -> None:
+ feeds_dir = tmp_path / "feeds"
+ old_file = feeds_dir / "demo" / "audio" / "old.mp3"
+ write_media(old_file, b"audio", age_days=40)
+
+ result = cleanup_media(
+ feeds_dir=feeds_dir,
+ retention_days=25,
+ now=NOW,
+ dry_run=True,
+ output=io.StringIO(),
+ )
+
+ assert old_file.exists()
+ assert result.matched_files == 1
+ assert result.deleted_files == 0
+ assert result.bytes_deleted == 0
+ assert result.failures == 0
+
+
+def test_cleanup_media_uses_configured_media_dirs(tmp_path: Path) -> None:
+ feeds_dir = tmp_path / "feeds"
+ demo_dir = feeds_dir / "demo"
+ demo_dir.mkdir(parents=True)
+ (demo_dir / "feed.rss").write_text(
+ """
+
+
+ -
+
+
+
+
+
+""".strip(),
+ encoding="utf-8",
+ )
+ write_media(demo_dir / "audio-custom" / "current.mp3", b"current", age_days=40)
+ write_media(demo_dir / "audio-custom" / "old.mp3", b"old", age_days=40)
+ write_media(demo_dir / "videos-custom" / "current.mp4", b"video", age_days=40)
+ write_media(demo_dir / "audio" / "legacy.mp3", b"legacy", age_days=40)
+
+ result = cleanup_media(
+ feeds_dir=feeds_dir,
+ retention_days=25,
+ now=NOW,
+ media_dirs=("audio-custom", "videos-custom"),
+ output=io.StringIO(),
+ )
+
+ assert (demo_dir / "audio-custom" / "current.mp3").exists()
+ assert not (demo_dir / "audio-custom" / "old.mp3").exists()
+ assert (demo_dir / "videos-custom" / "current.mp4").exists()
+ assert (demo_dir / "audio" / "legacy.mp3").exists()
+ assert result.matched_files == 1
+ assert result.deleted_files == 1
+ assert result.failures == 0
+
+
+def test_cleanup_media_waits_for_active_crawl_media_lock(tmp_path: Path) -> None:
+ out_dir = tmp_path / "out"
+ feeds_dir = out_dir / "feeds"
+ old_file = feeds_dir / "demo" / "audio" / "old.mp3"
+ write_media(old_file, b"audio", age_days=40)
+
+ lock_path = out_dir / ".media-retention.lock"
+ lock_path.parent.mkdir(parents=True, exist_ok=True)
+ started_path = tmp_path / "cleanup-started"
+ done_path = tmp_path / "cleanup-done"
+ script = """
+import io
+import sys
+from datetime import UTC, datetime
+from pathlib import Path
+
+from repub.cleanup import cleanup_media
+
+Path(sys.argv[2]).write_text("started", encoding="utf-8")
+cleanup_media(
+ feeds_dir=Path(sys.argv[1]),
+ retention_days=25,
+ now=datetime(2026, 5, 27, 12, 0, tzinfo=UTC),
+ output=io.StringIO(),
+)
+Path(sys.argv[3]).write_text("done", encoding="utf-8")
+"""
+
+ with lock_path.open("a", encoding="utf-8") as lock_file:
+ fcntl.flock(lock_file.fileno(), fcntl.LOCK_SH)
+ process = subprocess.Popen(
+ [
+ sys.executable,
+ "-c",
+ script,
+ str(feeds_dir),
+ str(started_path),
+ str(done_path),
+ ],
+ cwd=Path.cwd(),
+ stderr=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ text=True,
+ )
+ try:
+ wait_until(started_path)
+ time.sleep(0.5)
+ assert old_file.exists()
+ assert process.poll() is None
+ assert not done_path.exists()
+ finally:
+ fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
+
+ stdout, stderr = process.communicate(timeout=5)
+ assert process.returncode == 0, stdout + stderr
+ assert not old_file.exists()
+ assert done_path.exists()
diff --git a/tests/test_entrypoint.py b/tests/test_entrypoint.py
index 87edd9b..7e43f3b 100644
--- a/tests/test_entrypoint.py
+++ b/tests/test_entrypoint.py
@@ -39,6 +39,102 @@ def test_parse_args_supports_dev_mode_flag() -> None:
assert args.dev_mode is True
+def test_parse_args_supports_cleanup_media_defaults() -> None:
+ command, args = parse_args(["cleanup-media"])
+
+ assert command == "cleanup-media"
+ assert args.config is None
+ assert args.feeds_dir is None
+ assert args.days == 25
+ assert args.dry_run is False
+
+
+def test_entrypoint_runs_cleanup_media(monkeypatch, tmp_path) -> None:
+ recorded: dict[str, object] = {}
+
+ class FakeResult:
+ failures = 0
+
+ def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs):
+ recorded["feeds_dir"] = feeds_dir
+ recorded["retention_days"] = retention_days
+ recorded["dry_run"] = dry_run
+ recorded["media_dirs"] = media_dirs
+ return FakeResult()
+
+ monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media)
+
+ exit_code = entrypoint(
+ [
+ "cleanup-media",
+ "--feeds-dir",
+ str(tmp_path / "feeds"),
+ "--days",
+ "10",
+ "--dry-run",
+ ]
+ )
+
+ assert exit_code == 0
+ assert recorded == {
+ "feeds_dir": tmp_path / "feeds",
+ "retention_days": 10,
+ "dry_run": True,
+ "media_dirs": ("images", "audio", "video", "files"),
+ }
+
+
+def test_entrypoint_cleanup_media_uses_configured_media_dirs(
+ monkeypatch, tmp_path
+) -> None:
+ config_path = tmp_path / "repub.toml"
+ config_path.write_text(
+ """
+out_dir = "mirror"
+
+[[feeds]]
+name = "Demo"
+slug = "demo"
+url = "https://source.example/feed.rss"
+
+[scrapy.settings]
+REPUBLISHER_IMAGE_DIR = "images-custom"
+REPUBLISHER_AUDIO_DIR = "audio-custom"
+REPUBLISHER_VIDEO_DIR = "videos-custom"
+REPUBLISHER_FILE_DIR = "files-custom"
+""".strip(),
+ encoding="utf-8",
+ )
+ recorded: dict[str, object] = {}
+
+ class FakeResult:
+ failures = 0
+
+ def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs):
+ recorded["feeds_dir"] = feeds_dir
+ recorded["retention_days"] = retention_days
+ recorded["dry_run"] = dry_run
+ recorded["media_dirs"] = media_dirs
+ return FakeResult()
+
+ monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media)
+
+ exit_code = entrypoint(["cleanup-media", "--config", str(config_path)])
+
+ assert exit_code == 0
+ assert recorded == {
+ "feeds_dir": tmp_path / "mirror" / "feeds",
+ "retention_days": 25,
+ "dry_run": False,
+ "media_dirs": (
+ "images-custom",
+ "audio-custom",
+ "videos-custom",
+ "files-custom",
+ ),
+ }
+
+
def test_parse_args_defaults_to_dev_mode_when_no_args() -> None:
command, args = parse_args([])
diff --git a/tests/test_job_runner.py b/tests/test_job_runner.py
index 712a540..3bbb404 100644
--- a/tests/test_job_runner.py
+++ b/tests/test_job_runner.py
@@ -1,3 +1,5 @@
+import subprocess
+import sys
from pathlib import Path
import pytest
@@ -72,6 +74,66 @@ def test_main_publishes_staged_feed_after_successful_crawl(
assert not staged_path.exists()
+def test_main_holds_media_cleanup_lock_during_crawl(
+ monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+ out_dir = tmp_path / "out"
+ public_path = feed_output_path(out_dir=out_dir, feed_slug="demo")
+ staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo")
+ public_path.parent.mkdir(parents=True)
+ public_path.write_text("old\n", encoding="utf-8")
+ staged_path.write_text(VALID_FEED, encoding="utf-8")
+
+ def assert_media_lock_is_held(*, process, feed, spider_arguments) -> int:
+ lock_path = out_dir.resolve() / ".media-retention.lock"
+ script = """
+import fcntl
+import sys
+from pathlib import Path
+
+lock_path = Path(sys.argv[1])
+lock_path.parent.mkdir(parents=True, exist_ok=True)
+with lock_path.open("a", encoding="utf-8") as lock_file:
+ try:
+ fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+ except BlockingIOError:
+ sys.exit(0)
+ else:
+ fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
+ sys.exit(2)
+"""
+ completed = subprocess.run(
+ [sys.executable, "-c", script, str(lock_path)],
+ cwd=Path.cwd(),
+ capture_output=True,
+ check=False,
+ text=True,
+ )
+ assert completed.returncode == 0, completed.stdout + completed.stderr
+ return 0
+
+ _patch_worker_dependencies(
+ monkeypatch, exit_code=0, run_crawl=assert_media_lock_is_held
+ )
+
+ exit_code = job_runner_module.main(
+ [
+ "--job-id",
+ "1",
+ "--execution-id",
+ "2",
+ "--db-path",
+ str(tmp_path / "republisher.db"),
+ "--out-dir",
+ str(out_dir),
+ "--stats-path",
+ str(tmp_path / "stats.jsonl"),
+ ]
+ )
+
+ assert exit_code == 0
+
+
def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl(
monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
@@ -137,7 +199,7 @@ def test_main_does_not_publish_staged_feed_after_failed_crawl(
def _patch_worker_dependencies(
- monkeypatch: pytest.MonkeyPatch, *, exit_code: int
+ monkeypatch: pytest.MonkeyPatch, *, exit_code: int, run_crawl=None
) -> None:
monkeypatch.setattr(
job_runner_module,
@@ -161,5 +223,5 @@ def _patch_worker_dependencies(
monkeypatch.setattr(
job_runner_module,
"_run_crawl",
- lambda *, process, feed, spider_arguments: exit_code,
+ run_crawl or (lambda *, process, feed, spider_arguments: exit_code),
)