Add media retention cleanup command
All checks were successful
buildbot/nix-eval Build done.
buildbot/nix-build Build done.
buildbot/nix-effects Build done.

This commit is contained in:
Abel Luck 2026-05-27 13:04:47 +02:00
parent 3b6503a6ed
commit 507074b80e
10 changed files with 722 additions and 52 deletions

188
repub/cleanup.py Normal file
View file

@ -0,0 +1,188 @@
from __future__ import annotations
import fcntl
import html
import re
import sys
from collections.abc import Iterator, Sequence
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import UTC, datetime, timedelta
from pathlib import Path
from typing import TextIO
from urllib.parse import unquote, urlsplit
DEFAULT_MEDIA_DIRS = ("images", "audio", "video", "files")
MEDIA_RETENTION_LOCK = ".media-retention.lock"
@dataclass
class CleanupResult:
scanned_root: Path
cutoff: datetime
dry_run: bool
matched_files: int = 0
deleted_files: int = 0
bytes_deleted: int = 0
failures: int = 0
def _bool_text(value: bool) -> str:
return "true" if value else "false"
def _normalize_media_dirs(media_dirs: Sequence[str]) -> tuple[str, ...]:
normalized = tuple(
dict.fromkeys(
normalized_dir
for media_dir in media_dirs
if (normalized_dir := media_dir.strip("/"))
)
)
if not normalized:
raise ValueError("media_dirs must include at least one media directory")
return normalized
def _feed_reference_re(media_dirs: Sequence[str]) -> re.Pattern[str]:
media_names = "|".join(re.escape(media_dir) for media_dir in media_dirs)
return re.compile(
rf"""(?:https?://[^"'<>\s,]+|/?(?:feeds/[^/"'<>\s,]+/)?(?:{media_names})/[^"'<>\s,]+)"""
)
@contextmanager
def media_retention_lock(*, out_dir: Path, exclusive: bool) -> Iterator[None]:
out_dir = out_dir.resolve()
out_dir.mkdir(parents=True, exist_ok=True)
lock_mode = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
with (out_dir / MEDIA_RETENTION_LOCK).open("a", encoding="utf-8") as lock_file:
fcntl.flock(lock_file.fileno(), lock_mode)
try:
yield
finally:
fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)
def _feed_dirs(feeds_dir: Path) -> list[Path]:
if not feeds_dir.exists():
return []
return sorted(path for path in feeds_dir.iterdir() if path.is_dir())
def _referenced_media_paths(
feed_dir: Path, feed_body: str, media_dirs: Sequence[str]
) -> set[Path]:
protected: set[Path] = set()
slug = feed_dir.name
feed_prefix = f"/feeds/{slug}/"
feed_root = feed_dir.resolve()
media_dir_set = set(media_dirs)
for match in _feed_reference_re(media_dirs).finditer(feed_body):
reference = html.unescape(match.group(0))
path = unquote(urlsplit(reference).path)
if path.startswith(feed_prefix):
relative_path = path.removeprefix(feed_prefix)
else:
relative_path = path.lstrip("/")
if relative_path.split("/", maxsplit=1)[0] not in media_dir_set:
continue
candidate = (feed_dir / relative_path).resolve()
if candidate.is_relative_to(feed_root):
protected.add(candidate)
return protected
def collect_protected_paths(
feeds_dir: Path, media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS
) -> set[Path]:
media_dirs = _normalize_media_dirs(media_dirs)
protected: set[Path] = set()
for feed_dir in _feed_dirs(feeds_dir):
feed_path = feed_dir / "feed.rss"
if not feed_path.exists():
continue
protected.update(
_referenced_media_paths(
feed_dir,
feed_path.read_text(encoding="utf-8", errors="replace"),
media_dirs,
)
)
return protected
def _media_files(feeds_dir: Path, media_dirs: Sequence[str]) -> list[Path]:
files: list[Path] = []
for feed_dir in _feed_dirs(feeds_dir):
for media_dir_name in media_dirs:
media_dir = feed_dir / media_dir_name
if not media_dir.exists():
continue
files.extend(path for path in media_dir.rglob("*") if path.is_file())
return sorted(files)
def cleanup_media(
*,
feeds_dir: Path,
retention_days: int = 25,
now: datetime | None = None,
dry_run: bool = False,
output: TextIO = sys.stdout,
media_dirs: Sequence[str] = DEFAULT_MEDIA_DIRS,
) -> CleanupResult:
if now is None:
now = datetime.now(UTC)
elif now.tzinfo is None:
now = now.replace(tzinfo=UTC)
cutoff = now - timedelta(days=retention_days)
cutoff_timestamp = cutoff.timestamp()
feeds_dir = feeds_dir.resolve()
media_dirs = _normalize_media_dirs(media_dirs)
with media_retention_lock(out_dir=feeds_dir.parent, exclusive=True):
protected = collect_protected_paths(feeds_dir, media_dirs=media_dirs)
result = CleanupResult(scanned_root=feeds_dir, cutoff=cutoff, dry_run=dry_run)
for path in _media_files(feeds_dir, media_dirs):
try:
stat = path.stat()
except OSError as error:
result.failures += 1
print(
f"media cleanup: stat failed path={path} error={error}",
file=output,
)
continue
if stat.st_mtime >= cutoff_timestamp:
continue
if path.resolve() in protected:
continue
result.matched_files += 1
if dry_run:
continue
try:
path.unlink()
except OSError as error:
result.failures += 1
print(
f"media cleanup: delete failed path={path} error={error}",
file=output,
)
continue
result.deleted_files += 1
result.bytes_deleted += stat.st_size
print(
"media cleanup: "
f"dry_run={_bool_text(result.dry_run)} "
f"cutoff={result.cutoff.isoformat()} "
f"root={result.scanned_root} "
f"matched_files={result.matched_files} "
f"deleted_files={result.deleted_files} "
f"bytes_deleted={result.bytes_deleted} "
f"failures={result.failures}",
file=output,
)
return result

View file

@ -7,6 +7,7 @@ from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.settings import Settings
from twisted.python.failure import Failure
from repub.cleanup import media_retention_lock
from repub.config import (
FeedConfig,
build_base_settings,
@ -103,8 +104,9 @@ def run_feeds(
deferred.addCallbacks(handle_success, handle_error)
deferred.addBoth(crawl_next)
crawl_next()
process.start(stop_after_crawl=False)
with media_retention_lock(out_dir=out_dir, exclusive=False):
crawl_next()
process.start(stop_after_crawl=False)
return 1 if any(failure is not None for _, failure in results) else 0

View file

@ -7,11 +7,21 @@ import os
import signal
import sys
from contextlib import suppress
from pathlib import Path
from hypercorn.asyncio import serve as hypercorn_serve
from hypercorn.config import Config as HypercornConfig
import repub.crawl as crawl_module
from repub.cleanup import DEFAULT_MEDIA_DIRS, cleanup_media
from repub.config import (
AUDIO_DIR,
FILE_DIR,
IMAGE_DIR,
VIDEO_DIR,
build_base_settings,
load_config,
)
from repub.web import SHUTDOWN_EVENT_KEY, create_app
FeedNameFilter = crawl_module.FeedNameFilter
@ -61,11 +71,39 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
default="repub.toml",
help="Path to runtime config TOML file",
)
cleanup_parser = subparsers.add_parser(
"cleanup-media",
help="Delete old unreferenced published media",
)
cleanup_parser.add_argument(
"-c",
"--config",
default=None,
help="Read output and media directory settings from runtime config TOML",
)
cleanup_parser.add_argument(
"--feeds-dir",
default=None,
help="Published feeds directory to clean (default: config out_dir/feeds or out/feeds)",
)
cleanup_parser.add_argument(
"--days",
type=int,
default=25,
help="Delete unreferenced media older than this many days",
)
cleanup_parser.add_argument(
"--dry-run",
action="store_true",
help="Report cleanup matches without deleting files",
)
if not raw_args:
raw_args = ["serve", "--dev-mode"]
elif raw_args[0] in {"-c", "--config"}:
raw_args = ["crawl", *raw_args]
elif raw_args[0] not in {"serve", "crawl"}:
elif raw_args[0] not in {"serve", "crawl", "cleanup-media"}:
raw_args = ["serve", "--dev-mode", *raw_args]
args = parser.parse_args(raw_args)
@ -73,6 +111,25 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
return command, args
def _cleanup_config(args: argparse.Namespace) -> tuple[Path, tuple[str, ...]]:
feeds_dir = Path(args.feeds_dir) if args.feeds_dir else Path("out/feeds")
media_dirs = DEFAULT_MEDIA_DIRS
if args.config is None:
return feeds_dir, media_dirs
config = load_config(args.config)
settings = build_base_settings(config)
media_dirs = (
str(settings.get("REPUBLISHER_IMAGE_DIR", IMAGE_DIR)),
str(settings.get("REPUBLISHER_AUDIO_DIR", AUDIO_DIR)),
str(settings.get("REPUBLISHER_VIDEO_DIR", VIDEO_DIR)),
str(settings.get("REPUBLISHER_FILE_DIR", FILE_DIR)),
)
if args.feeds_dir is None:
feeds_dir = config.out_dir / "feeds"
return feeds_dir, media_dirs
def _install_signal_handlers(stop_event: asyncio.Event) -> None:
loop = asyncio.get_running_loop()
@ -116,6 +173,29 @@ def entrypoint(argv: list[str] | None = None) -> int:
crawl_module.check_runtime = check_runtime
return crawl_module.crawl_from_config(args.config)
if command == "cleanup-media":
try:
feeds_dir, media_dirs = _cleanup_config(args)
except FileNotFoundError as error:
missing_path = (
Path(error.filename).expanduser()
if error.filename
else Path(args.config).expanduser()
)
logger.error("Config file not found: %s", missing_path)
return 2
except ValueError as error:
logger.error("Invalid config: %s", error)
return 2
result = cleanup_media(
feeds_dir=feeds_dir,
retention_days=args.days,
dry_run=bool(args.dry_run),
media_dirs=media_dirs,
)
return 1 if result.failures else 0
try:
port = int(args.port)
except ValueError:

View file

@ -14,6 +14,7 @@ from scrapy.crawler import CrawlerProcess
from scrapy.statscollectors import StatsCollector
from twisted.python.failure import Failure
from repub.cleanup import media_retention_lock
from repub.config import (
FeedConfig,
RepublisherConfig,
@ -260,59 +261,60 @@ def main(argv: list[str] | None = None) -> int:
stats_path = Path(args.stats_path).resolve()
log_path = stats_path.with_suffix(".log")
try:
feed = _resolve_feed(
source_config=source_config,
out_dir=out_dir,
log_path=log_path,
)
process = CrawlerProcess(
_build_crawl_settings(
out_dir=out_dir,
feed=feed,
stats_path=stats_path,
convert_images=source_config.convert_images,
convert_video=source_config.convert_video,
feed_url=load_feed_url(),
)
)
print(
f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
flush=True,
)
exit_code = _run_crawl(
process=process,
feed=feed,
spider_arguments=source_config.spider_arguments,
)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
flush=True,
)
return 1
if stop_requested:
print(
f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
flush=True,
)
return 130
if exit_code == 0:
with media_retention_lock(out_dir=out_dir, exclusive=False):
try:
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
feed = _resolve_feed(
source_config=source_config,
out_dir=out_dir,
log_path=log_path,
)
process = CrawlerProcess(
_build_crawl_settings(
out_dir=out_dir,
feed=feed,
stats_path=stats_path,
convert_images=source_config.convert_images,
convert_video=source_config.convert_video,
feed_url=load_feed_url(),
)
)
print(
f"worker[{args.job_id}:{args.execution_id}]: starting crawl for {source_config.source_slug}",
flush=True,
)
exit_code = _run_crawl(
process=process,
feed=feed,
spider_arguments=source_config.spider_arguments,
)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
f"worker[{args.job_id}:{args.execution_id}]: crawl failed: {error}",
flush=True,
)
return 1
print(
f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
flush=True,
)
return exit_code
if stop_requested:
print(
f"worker[{args.job_id}:{args.execution_id}]: stopping after graceful request",
flush=True,
)
return 130
if exit_code == 0:
try:
publish_staged_feed(out_dir=out_dir, feed_slug=feed.slug)
except Exception as error:
print(
f"worker[{args.job_id}:{args.execution_id}]: publish failed: {error}",
flush=True,
)
return 1
print(
f"worker[{args.job_id}:{args.execution_id}]: completed successfully",
flush=True,
)
return exit_code
def _load_job_source_config(*, db_path: str, job_id: int) -> JobSourceConfig: