Add media retention cleanup command

2026-05-27 13:04:47 +02:00 · 2026-05-27 13:04:47 +02:00 · 507074b80e
commit 507074b80e
parent 3b6503a6ed
10 changed files with 722 additions and 52 deletions
--- a/repub/cleanup.py
+++ b/repub/cleanup.py
@ -0,0 +1,188 @@
+from __future__ import annotations
+
+import fcntl
+import html
+import re
+import sys
+from collections.abc import Iterator, Sequence
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import UTC, datetime, timedelta
+from pathlib import Path
+from typing import TextIO
+from urllib.parse import unquote, urlsplit
+
+DEFAULT_MEDIA_DIRS = ("images", "audio", "video", "files")
+MEDIA_RETENTION_LOCK = ".media-retention.lock"
+
+
+@dataclass
+class CleanupResult:
+    scanned_root: Path
+    cutoff: datetime
+    dry_run: bool
+    matched_files: int = 0
+    deleted_files: int = 0
+    bytes_deleted: int = 0
+    failures: int = 0
+
+
+def _bool_text(value: bool) -> str:
+    return "true" if value else "false"
+
+
+def _normalize_media_dirs(media_dirs: Sequence[str]) -> tuple[str, ...]:
+    normalized = tuple(
+        dict.fromkeys(
+            normalized_dir
+            for media_dir in media_dirs
+            if (normalized_dir := media_dir.strip("/"))
+        )
+    )
+    if not normalized:
+        raise ValueError("media_dirs must include at least one media directory")
+    return normalized
+
+
+def _feed_reference_re(media_dirs: Sequence[str]) -> re.Pattern[str]:
+    media_names = "|".join(re.escape(media_dir) for media_dir in media_dirs)
+    return re.compile(
+        rf"""(?:https?://[^"'<>\s,]+|/?(?:feeds/[^/"'<>\s,]+/)?(?:{media_names})/[^"'<>\s,]+)"""
+    )
+
+
+@contextmanager
+def media_retention_lock(*, out_dir: Path, exclusive: bool) -> Iterator[None]:
+    out_dir = out_dir.resolve()
+    out_dir.mkdir(parents=True, exist_ok=True)
+    lock_mode = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
+    with (out_dir / MEDIA_RETENTION_LOCK).open("a", encoding="utf-8") as lock_file:
+        fcntl.flock(lock_file.fileno(), lock_mode)
+        try:
+            yield
+        finally:
+            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
+
+
+def _feed_dirs(feeds_dir: Path) -> list[Path]:
+    if not feeds_dir.exists():
+        return []
+    return sorted(path for path in feeds_dir.iterdir() if path.is_dir())
+
+
+def _referenced_media_paths(
+    feed_dir: Path, feed_body: str, media_dirs: Sequence[str]
+) -> set[Path]:
+    protected: set[Path] = set()
+    slug = feed_dir.name
+    feed_prefix = f"/feeds/{slug}/"
+    feed_root = feed_dir.resolve()
+    media_dir_set = set(media_dirs)
+    for match in _feed_reference_re(media_dirs).finditer(feed_body):
+        reference = html.unescape(match.group(0))
+        path = unquote(urlsplit(reference).path)
+        if path.startswith(feed_prefix):
+            relative_path = path.removeprefix(feed_prefix)
+        else:
+            relative_path = path.lstrip("/")
+        if relative_path.split("/", maxsplit=1)[0] not in media_dir_set:
+            continue
+        candidate = (feed_dir / relative_path).resolve()
+        if candidate.is_relative_to(feed_root):
+            protected.add(candidate)
+    return protected
+
+
+def collect_protected_paths(
+    feeds_dir: Path, media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS
+) -> set[Path]:
+    media_dirs = _normalize_media_dirs(media_dirs)
+    protected: set[Path] = set()
+    for feed_dir in _feed_dirs(feeds_dir):
+        feed_path = feed_dir / "feed.rss"
+        if not feed_path.exists():
+            continue
+        protected.update(
+            _referenced_media_paths(
+                feed_dir,
+                feed_path.read_text(encoding="utf-8", errors="replace"),
+                media_dirs,
+            )
+        )
+    return protected
+
+
+def _media_files(feeds_dir: Path, media_dirs: Sequence[str]) -> list[Path]:
+    files: list[Path] = []
+    for feed_dir in _feed_dirs(feeds_dir):
+        for media_dir_name in media_dirs:
+            media_dir = feed_dir / media_dir_name
+            if not media_dir.exists():
+                continue
+            files.extend(path for path in media_dir.rglob("*") if path.is_file())
+    return sorted(files)
+
+
+def cleanup_media(
+    *,
+    feeds_dir: Path,
+    retention_days: int = 25,
+    now: datetime | None = None,
+    dry_run: bool = False,
+    output: TextIO = sys.stdout,
+    media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS,
+) -> CleanupResult:
+    if now is None:
+        now = datetime.now(UTC)
+    elif now.tzinfo is None:
+        now = now.replace(tzinfo=UTC)
+
+    cutoff = now - timedelta(days=retention_days)
+    cutoff_timestamp = cutoff.timestamp()
+    feeds_dir = feeds_dir.resolve()
+    media_dirs = _normalize_media_dirs(media_dirs)
+    with media_retention_lock(out_dir=feeds_dir.parent, exclusive=True):
+        protected = collect_protected_paths(feeds_dir, media_dirs=media_dirs)
+        result = CleanupResult(scanned_root=feeds_dir, cutoff=cutoff, dry_run=dry_run)
+
+        for path in _media_files(feeds_dir, media_dirs):
+            try:
+                stat = path.stat()
+            except OSError as error:
+                result.failures += 1
+                print(
+                    f"media cleanup: stat failed path={path} error={error}",
+                    file=output,
+                )
+                continue
+            if stat.st_mtime >= cutoff_timestamp:
+                continue
+            if path.resolve() in protected:
+                continue
+            result.matched_files += 1
+            if dry_run:
+                continue
+            try:
+                path.unlink()
+            except OSError as error:
+                result.failures += 1
+                print(
+                    f"media cleanup: delete failed path={path} error={error}",
+                    file=output,
+                )
+                continue
+            result.deleted_files += 1
+            result.bytes_deleted += stat.st_size
+
+        print(
+            "media cleanup: "
+            f"dry_run={_bool_text(result.dry_run)} "
+            f"cutoff={result.cutoff.isoformat()} "
+            f"root={result.scanned_root} "
+            f"matched_files={result.matched_files} "
+            f"deleted_files={result.deleted_files} "
+            f"bytes_deleted={result.bytes_deleted} "
+            f"failures={result.failures}",
+            file=output,
+        )
+    return result
--- a/repub/crawl.py
+++ b/repub/crawl.py
@ -7,6 +7,7 @@ from scrapy.crawler import Crawler, CrawlerProcess
 from scrapy.settings import Settings
 from twisted.python.failure import Failure

+from repub.cleanup import media_retention_lock
 from repub.config import (
    FeedConfig,
    build_base_settings,
@ -103,8 +104,9 @@ def run_feeds(
        deferred.addCallbacks(handle_success, handle_error)
        deferred.addBoth(crawl_next)

-    crawl_next()
-    process.start(stop_after_crawl=False)
+    with media_retention_lock(out_dir=out_dir, exclusive=False):
+        crawl_next()
+        process.start(stop_after_crawl=False)

    return 1 if any(failure is not None for _, failure in results) else 0

--- a/repub/entrypoint.py
+++ b/repub/entrypoint.py
@ -7,11 +7,21 @@ import os
 import signal
 import sys
 from contextlib import suppress
+from pathlib import Path

 from hypercorn.asyncio import serve as hypercorn_serve
 from hypercorn.config import Config as HypercornConfig

 import repub.crawl as crawl_module
+from repub.cleanup import DEFAULT_MEDIA_DIRS, cleanup_media
+from repub.config import (
+    AUDIO_DIR,
+    FILE_DIR,
+    IMAGE_DIR,
+    VIDEO_DIR,
+    build_base_settings,
+    load_config,
+)
 from repub.web import SHUTDOWN_EVENT_KEY, create_app

 FeedNameFilter = crawl_module.FeedNameFilter
@ -61,11 +71,39 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
        default="repub.toml",
        help="Path to runtime config TOML file",
    )
+
+    cleanup_parser = subparsers.add_parser(
+        "cleanup-media",
+        help="Delete old unreferenced published media",
+    )
+    cleanup_parser.add_argument(
+        "-c",
+        "--config",
+        default=None,
+        help="Read output and media directory settings from runtime config TOML",
+    )
+    cleanup_parser.add_argument(
+        "--feeds-dir",
+        default=None,
+        help="Published feeds directory to clean (default: config out_dir/feeds or out/feeds)",
+    )
+    cleanup_parser.add_argument(
+        "--days",
+        type=int,
+        default=25,
+        help="Delete unreferenced media older than this many days",
+    )
+    cleanup_parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Report cleanup matches without deleting files",
+    )
+
    if not raw_args:
        raw_args = ["serve", "--dev-mode"]
    elif raw_args[0] in {"-c", "--config"}:
        raw_args = ["crawl", *raw_args]
-    elif raw_args[0] not in {"serve", "crawl"}:
+    elif raw_args[0] not in {"serve", "crawl", "cleanup-media"}:
        raw_args = ["serve", "--dev-mode", *raw_args]

    args = parser.parse_args(raw_args)
@ -73,6 +111,25 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
    return command, args


+def _cleanup_config(args: argparse.Namespace) -> tuple[Path, tuple[str, ...]]:
+    feeds_dir = Path(args.feeds_dir) if args.feeds_dir else Path("out/feeds")
+    media_dirs = DEFAULT_MEDIA_DIRS
+    if args.config is None:
+        return feeds_dir, media_dirs
+
+    config = load_config(args.config)
+    settings = build_base_settings(config)
+    media_dirs = (
+        str(settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)),
+        str(settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)),
+        str(settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)),
+        str(settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)),
+    )
+    if args.feeds_dir is None:
+        feeds_dir = config.out_dir / "feeds"
+    return feeds_dir, media_dirs
+
+
 def _install_signal_handlers(stop_event: asyncio.Event) -> None:
    loop = asyncio.get_running_loop()

@ -116,6 +173,29 @@ def entrypoint(argv: list[str] | None = None) -> int:
        crawl_module.check_runtime = check_runtime
        return crawl_module.crawl_from_config(args.config)

+    if command == "cleanup-media":
+        try:
+            feeds_dir, media_dirs = _cleanup_config(args)
+        except FileNotFoundError as error:
+            missing_path = (
+                Path(error.filename).expanduser()
+                if error.filename
+                else Path(args.config).expanduser()
+            )
+            logger.error("Config file not found: %s", missing_path)
+            return 2
+        except ValueError as error:
+            logger.error("Invalid config: %s", error)
+            return 2
+
+        result = cleanup_media(
+            feeds_dir=feeds_dir,
+            retention_days=args.days,
+            dry_run=bool(args.dry_run),
+            media_dirs=media_dirs,
+        )
+        return 1 if result.failures else 0
+
    try:
        port = int(args.port)
    except ValueError:
--- a/repub/job_runner.py
+++ b/repub/job_runner.py
@ -14,6 +14,7 @@ from scrapy.crawler import CrawlerProcess
 from scrapy.statscollectors import StatsCollector
 from twisted.python.failure import Failure

+from repub.cleanup import media_retention_lock
 from repub.config import (
    FeedConfig,
    RepublisherConfig,
@ -260,59 +261,60 @@ def main(argv: list[str] | None = None) -> int:
    stats_path = Path(args.stats_path).resolve()
    log_path = stats_path.with_suffix(".log")

-    try:
-        feed = _resolve_feed(
-            source_config=source_config,
-            out_dir=out_dir,
-            log_path=log_path,
-        )
-        process = CrawlerProcess(
-            _build_crawl_settings(
-                out_dir=out_dir,
-                feed=feed,
-                stats_path=stats_path,
-                convert_images=source_config.convert_images,
-                convert_video=source_config.convert_video,
-                feed_url=load_feed_url(),
-            )
-        )
-        print(
-            f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
-            flush=True,
-        )
-        exit_code = _run_crawl(
-            process=process,
-            feed=feed,
-            spider_arguments=source_config.spider_arguments,
-        )
-    except Exception as error:
-        print(
-            f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
-            flush=True,
-        )
-        return 1
-
-    if stop_requested:
-        print(
-            f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
-            flush=True,
-        )
-        return 130
-
-    if exit_code == 0:
+    with media_retention_lock(out_dir=out_dir, exclusive=False):
        try:
-            publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+            feed = _resolve_feed(
+                source_config=source_config,
+                out_dir=out_dir,
+                log_path=log_path,
+            )
+            process = CrawlerProcess(
+                _build_crawl_settings(
+                    out_dir=out_dir,
+                    feed=feed,
+                    stats_path=stats_path,
+                    convert_images=source_config.convert_images,
+                    convert_video=source_config.convert_video,
+                    feed_url=load_feed_url(),
+                )
+            )
+            print(
+                f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
+                flush=True,
+            )
+            exit_code = _run_crawl(
+                process=process,
+                feed=feed,
+                spider_arguments=source_config.spider_arguments,
+            )
        except Exception as error:
            print(
-                f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
+                f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
                flush=True,
            )
            return 1
-        print(
-            f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
-            flush=True,
-        )
-    return exit_code
+
+        if stop_requested:
+            print(
+                f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
+                flush=True,
+            )
+            return 130
+
+        if exit_code == 0:
+            try:
+                publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
+            except Exception as error:
+                print(
+                    f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
+                    flush=True,
+                )
+                return 1
+            print(
+                f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
+                flush=True,
+            )
+        return exit_code


 def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig: