From 507074b80ef00f89b318a71b836c5a0654fee995 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Wed, 27 May 2026 13:04:47 +0200 Subject: [PATCH] Add media retention cleanup command --- README.md | 21 ++++ demo/README.md | 14 +++ demo/repub.toml | 5 + repub/cleanup.py | 188 ++++++++++++++++++++++++++++++++++++ repub/crawl.py | 6 +- repub/entrypoint.py | 82 +++++++++++++++- repub/job_runner.py | 96 ++++++++++--------- tests/test_cleanup.py | 200 +++++++++++++++++++++++++++++++++++++++ tests/test_entrypoint.py | 96 +++++++++++++++++++ tests/test_job_runner.py | 66 ++++++++++++- 10 files changed, 722 insertions(+), 52 deletions(-) create mode 100644 repub/cleanup.py create mode 100644 tests/test_cleanup.py diff --git a/README.md b/README.md index cab926d..b48c22a 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,27 @@ Operational notes: Reordering `REPUBLISHER_IMAGE` changes canonical feed image URLs. - Job logs and stats artifacts are written under `out/logs/`. +Media cleanup: + +- Published media can outlive the current feed when articles fall out of the + feed window. Use `cleanup-media` to delete old media files that are no longer + referenced by the latest published `feed.rss`. +- The default retention window is 25 days. Run a dry run first: + +```sh +uv run repub cleanup-media --feeds-dir out/feeds --days 25 --dry-run +``` + +- Remove `--dry-run` to delete matching files. The command protects media + referenced by the latest published feed and uses a lock to avoid racing with + active crawls. +- For config-driven deployments, pass the runtime config so cleanup uses the + configured `out_dir` and media directory names: + +```sh +uv run repub cleanup-media --config repub.toml --dry-run +``` + The legacy one-shot config-driven crawler is still available: ```sh diff --git a/demo/README.md b/demo/README.md index af4f0b8..652959e 100644 --- a/demo/README.md +++ b/demo/README.md @@ -30,6 +30,20 @@ variants are written under `images/thumbs/` inside each feed output directory. Edit the Scrapy settings in [`demo/repub.toml`](/home/abel/src/guardianproject/anynews/republisher-redux/demo/repub.toml) when a demo run needs to disable thumbnails or test a different profile set. +## Media Cleanup + +Published media can remain on disk after articles fall out of the current feed. +Run cleanup in dry-run mode first: + +```shell +uv run repub cleanup-media --config demo/repub.toml --dry-run +``` + +With `--config`, cleanup scans `demo/out/feeds/` and honors any +`REPUBLISHER_*_DIR` media directory overrides in the config. Remove `--dry-run` +to delete old unreferenced media. The default retention window is 25 days; use +`--days N` to override it. + ## Local File Feed `repub` already accepts absolute `file://` feed URIs. To point it at the demo fixture, generate an absolute URI like this from the repo root: diff --git a/demo/repub.toml b/demo/repub.toml index d829325..df8aa67 100644 --- a/demo/repub.toml +++ b/demo/repub.toml @@ -22,3 +22,8 @@ REPUBLISHER_FEED_URL = "https://mirror.example" # images plus JPEG thumbnails. # REPUBLISHER_IMAGE_NORMALIZE_ENABLED = true # REPUBLISHER_IMAGE_THUMBNAILS_ENABLED = true + +# Media cleanup can use this config: +# uv run repub cleanup-media --config demo/repub.toml --dry-run +# It scans out_dir/feeds, honors REPUBLISHER_*_DIR overrides, and defaults to a +# 25-day retention window for old media not referenced by the latest feed.rss. diff --git a/repub/cleanup.py b/repub/cleanup.py new file mode 100644 index 0000000..387c62a --- /dev/null +++ b/repub/cleanup.py @@ -0,0 +1,188 @@ +from __future__ import annotations + +import fcntl +import html +import re +import sys +from collections.abc import Iterator, Sequence +from contextlib import contextmanager +from dataclasses import dataclass +from datetime import UTC, datetime, timedelta +from pathlib import Path +from typing import TextIO +from urllib.parse import unquote, urlsplit + +DEFAULT_MEDIA_DIRS = ("images", "audio", "video", "files") +MEDIA_RETENTION_LOCK = ".media-retention.lock" + + +@dataclass +class CleanupResult: + scanned_root: Path + cutoff: datetime + dry_run: bool + matched_files: int = 0 + deleted_files: int = 0 + bytes_deleted: int = 0 + failures: int = 0 + + +def _bool_text(value: bool) -> str: + return "true" if value else "false" + + +def _normalize_media_dirs(media_dirs: Sequence[str]) -> tuple[str, ...]: + normalized = tuple( + dict.fromkeys( + normalized_dir + for media_dir in media_dirs + if (normalized_dir := media_dir.strip("/")) + ) + ) + if not normalized: + raise ValueError("media_dirs must include at least one media directory") + return normalized + + +def _feed_reference_re(media_dirs: Sequence[str]) -> re.Pattern[str]: + media_names = "|".join(re.escape(media_dir) for media_dir in media_dirs) + return re.compile( + rf"""(?:https?://[^"'<>\s,]+|/?(?:feeds/[^/"'<>\s,]+/)?(?:{media_names})/[^"'<>\s,]+)""" + ) + + +@contextmanager +def media_retention_lock(*, out_dir: Path, exclusive: bool) -> Iterator[None]: + out_dir = out_dir.resolve() + out_dir.mkdir(parents=True, exist_ok=True) + lock_mode = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH + with (out_dir / MEDIA_RETENTION_LOCK).open("a", encoding="utf-8") as lock_file: + fcntl.flock(lock_file.fileno(), lock_mode) + try: + yield + finally: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN) + + +def _feed_dirs(feeds_dir: Path) -> list[Path]: + if not feeds_dir.exists(): + return [] + return sorted(path for path in feeds_dir.iterdir() if path.is_dir()) + + +def _referenced_media_paths( + feed_dir: Path, feed_body: str, media_dirs: Sequence[str] +) -> set[Path]: + protected: set[Path] = set() + slug = feed_dir.name + feed_prefix = f"/feeds/{slug}/" + feed_root = feed_dir.resolve() + media_dir_set = set(media_dirs) + for match in _feed_reference_re(media_dirs).finditer(feed_body): + reference = html.unescape(match.group(0)) + path = unquote(urlsplit(reference).path) + if path.startswith(feed_prefix): + relative_path = path.removeprefix(feed_prefix) + else: + relative_path = path.lstrip("/") + if relative_path.split("/", maxsplit=1)[0] not in media_dir_set: + continue + candidate = (feed_dir / relative_path).resolve() + if candidate.is_relative_to(feed_root): + protected.add(candidate) + return protected + + +def collect_protected_paths( + feeds_dir: Path, media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS +) -> set[Path]: + media_dirs = _normalize_media_dirs(media_dirs) + protected: set[Path] = set() + for feed_dir in _feed_dirs(feeds_dir): + feed_path = feed_dir / "feed.rss" + if not feed_path.exists(): + continue + protected.update( + _referenced_media_paths( + feed_dir, + feed_path.read_text(encoding="utf-8", errors="replace"), + media_dirs, + ) + ) + return protected + + +def _media_files(feeds_dir: Path, media_dirs: Sequence[str]) -> list[Path]: + files: list[Path] = [] + for feed_dir in _feed_dirs(feeds_dir): + for media_dir_name in media_dirs: + media_dir = feed_dir / media_dir_name + if not media_dir.exists(): + continue + files.extend(path for path in media_dir.rglob("*") if path.is_file()) + return sorted(files) + + +def cleanup_media( + *, + feeds_dir: Path, + retention_days: int = 25, + now: datetime | None = None, + dry_run: bool = False, + output: TextIO = sys.stdout, + media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS, +) -> CleanupResult: + if now is None: + now = datetime.now(UTC) + elif now.tzinfo is None: + now = now.replace(tzinfo=UTC) + + cutoff = now - timedelta(days=retention_days) + cutoff_timestamp = cutoff.timestamp() + feeds_dir = feeds_dir.resolve() + media_dirs = _normalize_media_dirs(media_dirs) + with media_retention_lock(out_dir=feeds_dir.parent, exclusive=True): + protected = collect_protected_paths(feeds_dir, media_dirs=media_dirs) + result = CleanupResult(scanned_root=feeds_dir, cutoff=cutoff, dry_run=dry_run) + + for path in _media_files(feeds_dir, media_dirs): + try: + stat = path.stat() + except OSError as error: + result.failures += 1 + print( + f"media cleanup: stat failed path={path} error={error}", + file=output, + ) + continue + if stat.st_mtime >= cutoff_timestamp: + continue + if path.resolve() in protected: + continue + result.matched_files += 1 + if dry_run: + continue + try: + path.unlink() + except OSError as error: + result.failures += 1 + print( + f"media cleanup: delete failed path={path} error={error}", + file=output, + ) + continue + result.deleted_files += 1 + result.bytes_deleted += stat.st_size + + print( + "media cleanup: " + f"dry_run={_bool_text(result.dry_run)} " + f"cutoff={result.cutoff.isoformat()} " + f"root={result.scanned_root} " + f"matched_files={result.matched_files} " + f"deleted_files={result.deleted_files} " + f"bytes_deleted={result.bytes_deleted} " + f"failures={result.failures}", + file=output, + ) + return result diff --git a/repub/crawl.py b/repub/crawl.py index 6f0d9f4..3c7f41b 100644 --- a/repub/crawl.py +++ b/repub/crawl.py @@ -7,6 +7,7 @@ from scrapy.crawler import Crawler, CrawlerProcess from scrapy.settings import Settings from twisted.python.failure import Failure +from repub.cleanup import media_retention_lock from repub.config import ( FeedConfig, build_base_settings, @@ -103,8 +104,9 @@ def run_feeds( deferred.addCallbacks(handle_success, handle_error) deferred.addBoth(crawl_next) - crawl_next() - process.start(stop_after_crawl=False) + with media_retention_lock(out_dir=out_dir, exclusive=False): + crawl_next() + process.start(stop_after_crawl=False) return 1 if any(failure is not None for _, failure in results) else 0 diff --git a/repub/entrypoint.py b/repub/entrypoint.py index 1cc7932..4d0bc83 100644 --- a/repub/entrypoint.py +++ b/repub/entrypoint.py @@ -7,11 +7,21 @@ import os import signal import sys from contextlib import suppress +from pathlib import Path from hypercorn.asyncio import serve as hypercorn_serve from hypercorn.config import Config as HypercornConfig import repub.crawl as crawl_module +from repub.cleanup import DEFAULT_MEDIA_DIRS, cleanup_media +from repub.config import ( + AUDIO_DIR, + FILE_DIR, + IMAGE_DIR, + VIDEO_DIR, + build_base_settings, + load_config, +) from repub.web import SHUTDOWN_EVENT_KEY, create_app FeedNameFilter = crawl_module.FeedNameFilter @@ -61,11 +71,39 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]: default="repub.toml", help="Path to runtime config TOML file", ) + + cleanup_parser = subparsers.add_parser( + "cleanup-media", + help="Delete old unreferenced published media", + ) + cleanup_parser.add_argument( + "-c", + "--config", + default=None, + help="Read output and media directory settings from runtime config TOML", + ) + cleanup_parser.add_argument( + "--feeds-dir", + default=None, + help="Published feeds directory to clean (default: config out_dir/feeds or out/feeds)", + ) + cleanup_parser.add_argument( + "--days", + type=int, + default=25, + help="Delete unreferenced media older than this many days", + ) + cleanup_parser.add_argument( + "--dry-run", + action="store_true", + help="Report cleanup matches without deleting files", + ) + if not raw_args: raw_args = ["serve", "--dev-mode"] elif raw_args[0] in {"-c", "--config"}: raw_args = ["crawl", *raw_args] - elif raw_args[0] not in {"serve", "crawl"}: + elif raw_args[0] not in {"serve", "crawl", "cleanup-media"}: raw_args = ["serve", "--dev-mode", *raw_args] args = parser.parse_args(raw_args) @@ -73,6 +111,25 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]: return command, args +def _cleanup_config(args: argparse.Namespace) -> tuple[Path, tuple[str, ...]]: + feeds_dir = Path(args.feeds_dir) if args.feeds_dir else Path("out/feeds") + media_dirs = DEFAULT_MEDIA_DIRS + if args.config is None: + return feeds_dir, media_dirs + + config = load_config(args.config) + settings = build_base_settings(config) + media_dirs = ( + str(settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)), + str(settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)), + str(settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)), + str(settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)), + ) + if args.feeds_dir is None: + feeds_dir = config.out_dir / "feeds" + return feeds_dir, media_dirs + + def _install_signal_handlers(stop_event: asyncio.Event) -> None: loop = asyncio.get_running_loop() @@ -116,6 +173,29 @@ def entrypoint(argv: list[str] | None = None) -> int: crawl_module.check_runtime = check_runtime return crawl_module.crawl_from_config(args.config) + if command == "cleanup-media": + try: + feeds_dir, media_dirs = _cleanup_config(args) + except FileNotFoundError as error: + missing_path = ( + Path(error.filename).expanduser() + if error.filename + else Path(args.config).expanduser() + ) + logger.error("Config file not found: %s", missing_path) + return 2 + except ValueError as error: + logger.error("Invalid config: %s", error) + return 2 + + result = cleanup_media( + feeds_dir=feeds_dir, + retention_days=args.days, + dry_run=bool(args.dry_run), + media_dirs=media_dirs, + ) + return 1 if result.failures else 0 + try: port = int(args.port) except ValueError: diff --git a/repub/job_runner.py b/repub/job_runner.py index 008bd15..97abcf0 100644 --- a/repub/job_runner.py +++ b/repub/job_runner.py @@ -14,6 +14,7 @@ from scrapy.crawler import CrawlerProcess from scrapy.statscollectors import StatsCollector from twisted.python.failure import Failure +from repub.cleanup import media_retention_lock from repub.config import ( FeedConfig, RepublisherConfig, @@ -260,59 +261,60 @@ def main(argv: list[str] | None = None) -> int: stats_path = Path(args.stats_path).resolve() log_path = stats_path.with_suffix(".log") - try: - feed = _resolve_feed( - source_config=source_config, - out_dir=out_dir, - log_path=log_path, - ) - process = CrawlerProcess( - _build_crawl_settings( - out_dir=out_dir, - feed=feed, - stats_path=stats_path, - convert_images=source_config.convert_images, - convert_video=source_config.convert_video, - feed_url=load_feed_url(), - ) - ) - print( - f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}", - flush=True, - ) - exit_code = _run_crawl( - process=process, - feed=feed, - spider_arguments=source_config.spider_arguments, - ) - except Exception as error: - print( - f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}", - flush=True, - ) - return 1 - - if stop_requested: - print( - f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request", - flush=True, - ) - return 130 - - if exit_code == 0: + with media_retention_lock(out_dir=out_dir, exclusive=False): try: - publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug) + feed = _resolve_feed( + source_config=source_config, + out_dir=out_dir, + log_path=log_path, + ) + process = CrawlerProcess( + _build_crawl_settings( + out_dir=out_dir, + feed=feed, + stats_path=stats_path, + convert_images=source_config.convert_images, + convert_video=source_config.convert_video, + feed_url=load_feed_url(), + ) + ) + print( + f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}", + flush=True, + ) + exit_code = _run_crawl( + process=process, + feed=feed, + spider_arguments=source_config.spider_arguments, + ) except Exception as error: print( - f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}", + f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}", flush=True, ) return 1 - print( - f"worker[{args.job_id}:{args.execution_id}]: completed successfully", - flush=True, - ) - return exit_code + + if stop_requested: + print( + f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request", + flush=True, + ) + return 130 + + if exit_code == 0: + try: + publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug) + except Exception as error: + print( + f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}", + flush=True, + ) + return 1 + print( + f"worker[{args.job_id}:{args.execution_id}]: completed successfully", + flush=True, + ) + return exit_code def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig: diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py new file mode 100644 index 0000000..c5d9748 --- /dev/null +++ b/tests/test_cleanup.py @@ -0,0 +1,200 @@ +import fcntl +import io +import os +import subprocess +import sys +import time +from datetime import UTC, datetime, timedelta +from pathlib import Path + +from repub.cleanup import cleanup_media + +NOW = datetime(2026, 5, 27, 12, 0, tzinfo=UTC) + + +def write_media(path: Path, body: bytes, *, age_days: int) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(body) + timestamp = (NOW - timedelta(days=age_days)).timestamp() + os.utime(path, (timestamp, timestamp)) + + +def wait_until(path: Path, *, timeout: float = 5.0) -> None: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if path.exists(): + return + time.sleep(0.05) + raise AssertionError(f"timed out waiting for {path}") + + +def test_cleanup_media_deletes_old_unreferenced_media_and_protects_latest_feed_refs( + tmp_path: Path, +) -> None: + feeds_dir = tmp_path / "feeds" + demo_dir = feeds_dir / "demo" + demo_dir.mkdir(parents=True) + (demo_dir / "feed.rss").write_text( + """ + + + + + + ]]> + + + +""".strip(), + encoding="utf-8", + ) + write_media(demo_dir / "audio" / "current.mp3", b"audio", age_days=40) + write_media(demo_dir / "images" / "full" / "current.webp", b"webp", age_days=40) + write_media(demo_dir / "images" / "thumbs" / "current.jpg", b"jpg", age_days=40) + write_media(demo_dir / "images" / "source" / "current.png", b"source", age_days=40) + write_media(demo_dir / "video" / "old.mp4", b"video", age_days=40) + write_media(demo_dir / "files" / "fresh.pdf", b"fresh", age_days=2) + write_media(demo_dir / "images" / "full" / "old.webp", b"old", age_days=40) + write_media(demo_dir / ".feed.rss.next", b"staged", age_days=40) + + output = io.StringIO() + result = cleanup_media( + feeds_dir=feeds_dir, + retention_days=25, + now=NOW, + dry_run=False, + output=output, + ) + + assert (demo_dir / "audio" / "current.mp3").exists() + assert (demo_dir / "images" / "full" / "current.webp").exists() + assert (demo_dir / "images" / "thumbs" / "current.jpg").exists() + assert not (demo_dir / "images" / "source" / "current.png").exists() + assert not (demo_dir / "video" / "old.mp4").exists() + assert not (demo_dir / "images" / "full" / "old.webp").exists() + assert (demo_dir / "files" / "fresh.pdf").exists() + assert (demo_dir / ".feed.rss.next").exists() + assert result.matched_files == 3 + assert result.deleted_files == 3 + assert result.bytes_deleted == len(b"source") + len(b"video") + len(b"old") + assert result.failures == 0 + assert "dry_run=false" in output.getvalue() + assert "deleted_files=3" in output.getvalue() + + +def test_cleanup_media_dry_run_reports_matches_without_deleting(tmp_path: Path) -> None: + feeds_dir = tmp_path / "feeds" + old_file = feeds_dir / "demo" / "audio" / "old.mp3" + write_media(old_file, b"audio", age_days=40) + + result = cleanup_media( + feeds_dir=feeds_dir, + retention_days=25, + now=NOW, + dry_run=True, + output=io.StringIO(), + ) + + assert old_file.exists() + assert result.matched_files == 1 + assert result.deleted_files == 0 + assert result.bytes_deleted == 0 + assert result.failures == 0 + + +def test_cleanup_media_uses_configured_media_dirs(tmp_path: Path) -> None: + feeds_dir = tmp_path / "feeds" + demo_dir = feeds_dir / "demo" + demo_dir.mkdir(parents=True) + (demo_dir / "feed.rss").write_text( + """ + + + + + + + + +""".strip(), + encoding="utf-8", + ) + write_media(demo_dir / "audio-custom" / "current.mp3", b"current", age_days=40) + write_media(demo_dir / "audio-custom" / "old.mp3", b"old", age_days=40) + write_media(demo_dir / "videos-custom" / "current.mp4", b"video", age_days=40) + write_media(demo_dir / "audio" / "legacy.mp3", b"legacy", age_days=40) + + result = cleanup_media( + feeds_dir=feeds_dir, + retention_days=25, + now=NOW, + media_dirs=("audio-custom", "videos-custom"), + output=io.StringIO(), + ) + + assert (demo_dir / "audio-custom" / "current.mp3").exists() + assert not (demo_dir / "audio-custom" / "old.mp3").exists() + assert (demo_dir / "videos-custom" / "current.mp4").exists() + assert (demo_dir / "audio" / "legacy.mp3").exists() + assert result.matched_files == 1 + assert result.deleted_files == 1 + assert result.failures == 0 + + +def test_cleanup_media_waits_for_active_crawl_media_lock(tmp_path: Path) -> None: + out_dir = tmp_path / "out" + feeds_dir = out_dir / "feeds" + old_file = feeds_dir / "demo" / "audio" / "old.mp3" + write_media(old_file, b"audio", age_days=40) + + lock_path = out_dir / ".media-retention.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + started_path = tmp_path / "cleanup-started" + done_path = tmp_path / "cleanup-done" + script = """ +import io +import sys +from datetime import UTC, datetime +from pathlib import Path + +from repub.cleanup import cleanup_media + +Path(sys.argv[2]).write_text("started", encoding="utf-8") +cleanup_media( + feeds_dir=Path(sys.argv[1]), + retention_days=25, + now=datetime(2026, 5, 27, 12, 0, tzinfo=UTC), + output=io.StringIO(), +) +Path(sys.argv[3]).write_text("done", encoding="utf-8") +""" + + with lock_path.open("a", encoding="utf-8") as lock_file: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_SH) + process = subprocess.Popen( + [ + sys.executable, + "-c", + script, + str(feeds_dir), + str(started_path), + str(done_path), + ], + cwd=Path.cwd(), + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True, + ) + try: + wait_until(started_path) + time.sleep(0.5) + assert old_file.exists() + assert process.poll() is None + assert not done_path.exists() + finally: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN) + + stdout, stderr = process.communicate(timeout=5) + assert process.returncode == 0, stdout + stderr + assert not old_file.exists() + assert done_path.exists() diff --git a/tests/test_entrypoint.py b/tests/test_entrypoint.py index 87edd9b..7e43f3b 100644 --- a/tests/test_entrypoint.py +++ b/tests/test_entrypoint.py @@ -39,6 +39,102 @@ def test_parse_args_supports_dev_mode_flag() -> None: assert args.dev_mode is True +def test_parse_args_supports_cleanup_media_defaults() -> None: + command, args = parse_args(["cleanup-media"]) + + assert command == "cleanup-media" + assert args.config is None + assert args.feeds_dir is None + assert args.days == 25 + assert args.dry_run is False + + +def test_entrypoint_runs_cleanup_media(monkeypatch, tmp_path) -> None: + recorded: dict[str, object] = {} + + class FakeResult: + failures = 0 + + def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs): + recorded["feeds_dir"] = feeds_dir + recorded["retention_days"] = retention_days + recorded["dry_run"] = dry_run + recorded["media_dirs"] = media_dirs + return FakeResult() + + monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media) + + exit_code = entrypoint( + [ + "cleanup-media", + "--feeds-dir", + str(tmp_path / "feeds"), + "--days", + "10", + "--dry-run", + ] + ) + + assert exit_code == 0 + assert recorded == { + "feeds_dir": tmp_path / "feeds", + "retention_days": 10, + "dry_run": True, + "media_dirs": ("images", "audio", "video", "files"), + } + + +def test_entrypoint_cleanup_media_uses_configured_media_dirs( + monkeypatch, tmp_path +) -> None: + config_path = tmp_path / "repub.toml" + config_path.write_text( + """ +out_dir = "mirror" + +[[feeds]] +name = "Demo" +slug = "demo" +url = "https://source.example/feed.rss" + +[scrapy.settings] +REPUBLISHER_IMAGE_DIR = "images-custom" +REPUBLISHER_AUDIO_DIR = "audio-custom" +REPUBLISHER_VIDEO_DIR = "videos-custom" +REPUBLISHER_FILE_DIR = "files-custom" +""".strip(), + encoding="utf-8", + ) + recorded: dict[str, object] = {} + + class FakeResult: + failures = 0 + + def fake_cleanup_media(*, feeds_dir, retention_days, dry_run, media_dirs): + recorded["feeds_dir"] = feeds_dir + recorded["retention_days"] = retention_days + recorded["dry_run"] = dry_run + recorded["media_dirs"] = media_dirs + return FakeResult() + + monkeypatch.setattr("repub.entrypoint.cleanup_media", fake_cleanup_media) + + exit_code = entrypoint(["cleanup-media", "--config", str(config_path)]) + + assert exit_code == 0 + assert recorded == { + "feeds_dir": tmp_path / "mirror" / "feeds", + "retention_days": 25, + "dry_run": False, + "media_dirs": ( + "images-custom", + "audio-custom", + "videos-custom", + "files-custom", + ), + } + + def test_parse_args_defaults_to_dev_mode_when_no_args() -> None: command, args = parse_args([]) diff --git a/tests/test_job_runner.py b/tests/test_job_runner.py index 712a540..3bbb404 100644 --- a/tests/test_job_runner.py +++ b/tests/test_job_runner.py @@ -1,3 +1,5 @@ +import subprocess +import sys from pathlib import Path import pytest @@ -72,6 +74,66 @@ def test_main_publishes_staged_feed_after_successful_crawl( assert not staged_path.exists() +def test_main_holds_media_cleanup_lock_during_crawl( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + out_dir = tmp_path / "out" + public_path = feed_output_path(out_dir=out_dir, feed_slug="demo") + staged_path = staged_feed_output_path(out_dir=out_dir, feed_slug="demo") + public_path.parent.mkdir(parents=True) + public_path.write_text("old\n", encoding="utf-8") + staged_path.write_text(VALID_FEED, encoding="utf-8") + + def assert_media_lock_is_held(*, process, feed, spider_arguments) -> int: + lock_path = out_dir.resolve() / ".media-retention.lock" + script = """ +import fcntl +import sys +from pathlib import Path + +lock_path = Path(sys.argv[1]) +lock_path.parent.mkdir(parents=True, exist_ok=True) +with lock_path.open("a", encoding="utf-8") as lock_file: + try: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + except BlockingIOError: + sys.exit(0) + else: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN) + sys.exit(2) +""" + completed = subprocess.run( + [sys.executable, "-c", script, str(lock_path)], + cwd=Path.cwd(), + capture_output=True, + check=False, + text=True, + ) + assert completed.returncode == 0, completed.stdout + completed.stderr + return 0 + + _patch_worker_dependencies( + monkeypatch, exit_code=0, run_crawl=assert_media_lock_is_held + ) + + exit_code = job_runner_module.main( + [ + "--job-id", + "1", + "--execution-id", + "2", + "--db-path", + str(tmp_path / "republisher.db"), + "--out-dir", + str(out_dir), + "--stats-path", + str(tmp_path / "stats.jsonl"), + ] + ) + + assert exit_code == 0 + + def test_main_does_not_publish_unusable_staged_feed_after_successful_crawl( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: @@ -137,7 +199,7 @@ def test_main_does_not_publish_staged_feed_after_failed_crawl( def _patch_worker_dependencies( - monkeypatch: pytest.MonkeyPatch, *, exit_code: int + monkeypatch: pytest.MonkeyPatch, *, exit_code: int, run_crawl=None ) -> None: monkeypatch.setattr( job_runner_module, @@ -161,5 +223,5 @@ def _patch_worker_dependencies( monkeypatch.setattr( job_runner_module, "_run_crawl", - lambda *, process, feed, spider_arguments: exit_code, + run_crawl or (lambda *, process, feed, spider_arguments: exit_code), )