Prune old job executions
All checks were successful
buildbot/nix-eval Build done.
buildbot/nix-build Build done.
buildbot/nix-effects Build done.

This commit is contained in:
Abel Luck 2026-06-02 11:31:39 +02:00
parent 813f19f355
commit 710ac76192
6 changed files with 552 additions and 11 deletions

View file

@ -22,6 +22,11 @@ from repub.config import (
build_base_settings,
load_config,
)
from repub.job_retention import (
DEFAULT_SUCCESSFUL_EXECUTION_RETENTION_DAYS,
DEFAULT_UNSUCCESSFUL_EXECUTION_RETENTION_DAYS,
cleanup_job_executions,
)
from repub.web import SHUTDOWN_EVENT_KEY, create_app
FeedNameFilter = crawl_module.FeedNameFilter
@ -97,6 +102,11 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
default=None,
help="Published feeds directory to clean (default: config out_dir/feeds or out/feeds)",
)
cleanup_parser.add_argument(
"--log-dir",
default=None,
help="Job execution log directory to clean (default: config out_dir/logs or alongside feeds)",
)
cleanup_parser.add_argument(
"--days",
type=int,
@ -121,11 +131,12 @@ def parse_args(argv: list[str] | None = None) -> tuple[str, argparse.Namespace]:
return command, args
def _cleanup_config(args: argparse.Namespace) -> tuple[Path, tuple[str, ...]]:
def _cleanup_config(args: argparse.Namespace) -> tuple[Path, Path, tuple[str, ...]]:
feeds_dir = Path(args.feeds_dir) if args.feeds_dir else Path("out/feeds")
log_dir = Path(args.log_dir) if args.log_dir else feeds_dir.parent / "logs"
media_dirs = DEFAULT_MEDIA_DIRS
if args.config is None:
return feeds_dir, media_dirs
return feeds_dir, log_dir, media_dirs
config = load_config(args.config)
settings = build_base_settings(config)
@ -137,7 +148,9 @@ def _cleanup_config(args: argparse.Namespace) -> tuple[Path, tuple[str, ...]]:
)
if args.feeds_dir is None:
feeds_dir = config.out_dir / "feeds"
return feeds_dir, media_dirs
if args.log_dir is None:
log_dir = config.out_dir / "logs"
return feeds_dir, log_dir, media_dirs
def _install_signal_handlers(stop_event: asyncio.Event) -> None:
@ -187,7 +200,7 @@ def entrypoint(argv: list[str] | None = None) -> int:
if command == "cleanup-media":
try:
feeds_dir, media_dirs = _cleanup_config(args)
feeds_dir, log_dir, media_dirs = _cleanup_config(args)
except FileNotFoundError as error:
missing_path = (
Path(error.filename).expanduser()
@ -206,7 +219,13 @@ def entrypoint(argv: list[str] | None = None) -> int:
dry_run=bool(args.dry_run),
media_dirs=media_dirs,
)
return 1 if result.failures else 0
job_result = cleanup_job_executions(
log_dir=log_dir,
successful_days=DEFAULT_SUCCESSFUL_EXECUTION_RETENTION_DAYS,
unsuccessful_days=DEFAULT_UNSUCCESSFUL_EXECUTION_RETENTION_DAYS,
dry_run=bool(args.dry_run),
)
return 1 if result.failures or job_result.failures else 0
try:
port = int(args.port)

220
repub/job_retention.py Normal file
View file

@ -0,0 +1,220 @@
from __future__ import annotations
import sys
from dataclasses import dataclass
from datetime import UTC, datetime, timedelta
from pathlib import Path
from typing import TextIO, cast
from repub.db import get_database_connection, initialize_database
from repub.jobs import JobArtifacts
from repub.model import Job, JobExecution, JobExecutionStatus, database
DEFAULT_SUCCESSFUL_EXECUTION_RETENTION_DAYS = 7
DEFAULT_UNSUCCESSFUL_EXECUTION_RETENTION_DAYS = 90
UNSUCCESSFUL_EXECUTION_STATUSES = (
JobExecutionStatus.FAILED,
JobExecutionStatus.CANCELED,
)
@dataclass
class JobExecutionRetentionResult:
log_dir: Path
successful_cutoff: datetime
unsuccessful_cutoff: datetime
dry_run: bool
matched_executions: int = 0
deleted_executions: int = 0
matched_files: int = 0
deleted_files: int = 0
bytes_deleted: int = 0
failures: int = 0
@dataclass(frozen=True)
class _ExecutionRetentionCandidate:
execution_id: int
job_id: int
status: JobExecutionStatus
ended_at: datetime
def cleanup_job_executions(
*,
log_dir: str | Path,
successful_days: int = DEFAULT_SUCCESSFUL_EXECUTION_RETENTION_DAYS,
unsuccessful_days: int = DEFAULT_UNSUCCESSFUL_EXECUTION_RETENTION_DAYS,
now: datetime | None = None,
dry_run: bool = False,
output: TextIO = sys.stdout,
) -> JobExecutionRetentionResult:
if get_database_connection() is None:
initialize_database()
reference_time = _coerce_datetime(now or datetime.now(UTC))
successful_cutoff = reference_time - timedelta(days=successful_days)
unsuccessful_cutoff = reference_time - timedelta(days=unsuccessful_days)
resolved_log_dir = Path(log_dir).resolve()
result = JobExecutionRetentionResult(
log_dir=resolved_log_dir,
successful_cutoff=successful_cutoff,
unsuccessful_cutoff=unsuccessful_cutoff,
dry_run=dry_run,
)
candidates = _retention_candidates(
successful_cutoff=successful_cutoff,
unsuccessful_cutoff=unsuccessful_cutoff,
)
execution_ids_to_delete: list[int] = []
for candidate in candidates:
result.matched_executions += 1
print(
"job retention: matched "
f"execution_id={candidate.execution_id} "
f"job_id={candidate.job_id} "
f"status={candidate.status.name} "
f"ended_at={candidate.ended_at.isoformat()}",
file=output,
)
artifacts = JobArtifacts.for_execution(
log_dir=resolved_log_dir,
job_id=candidate.job_id,
execution_id=candidate.execution_id,
)
artifact_cleanup_succeeded = _cleanup_artifacts(
artifacts=artifacts,
result=result,
dry_run=dry_run,
output=output,
)
if dry_run or not artifact_cleanup_succeeded:
continue
execution_ids_to_delete.append(candidate.execution_id)
if execution_ids_to_delete:
with database.writer():
execution_primary_key = getattr(JobExecution, "_meta").primary_key
result.deleted_executions = (
JobExecution.delete()
.where(execution_primary_key.in_(tuple(execution_ids_to_delete)))
.execute()
)
print(
"job retention: "
f"dry_run={_bool_text(result.dry_run)} "
f"successful_cutoff={result.successful_cutoff.isoformat()} "
f"unsuccessful_cutoff={result.unsuccessful_cutoff.isoformat()} "
f"root={result.log_dir} "
f"matched_executions={result.matched_executions} "
f"deleted_executions={result.deleted_executions} "
f"matched_files={result.matched_files} "
f"deleted_files={result.deleted_files} "
f"bytes_deleted={result.bytes_deleted} "
f"failures={result.failures}",
file=output,
)
return result
def _retention_candidates(
*, successful_cutoff: datetime, unsuccessful_cutoff: datetime
) -> tuple[_ExecutionRetentionCandidate, ...]:
with database.reader():
executions = tuple(
JobExecution.select(JobExecution, Job)
.join(Job)
.where(
(
JobExecution.running_status.in_(
(
JobExecutionStatus.SUCCEEDED,
*UNSUCCESSFUL_EXECUTION_STATUSES,
)
)
)
& (JobExecution.ended_at.is_null(False))
)
)
candidates: list[_ExecutionRetentionCandidate] = []
for execution in executions:
status = JobExecutionStatus(int(execution.running_status))
ended_at = _coerce_datetime(cast(datetime | str, execution.ended_at))
if status == JobExecutionStatus.SUCCEEDED:
if ended_at >= successful_cutoff:
continue
elif status in UNSUCCESSFUL_EXECUTION_STATUSES:
if ended_at >= unsuccessful_cutoff:
continue
else:
continue
job = cast(Job, execution.job)
candidates.append(
_ExecutionRetentionCandidate(
execution_id=int(execution.get_id()),
job_id=int(job.get_id()),
status=status,
ended_at=ended_at,
)
)
return tuple(candidates)
def _cleanup_artifacts(
*,
artifacts: JobArtifacts,
result: JobExecutionRetentionResult,
dry_run: bool,
output: TextIO,
) -> bool:
succeeded = True
for path in artifacts.existing_paths():
result.matched_files += 1
try:
file_size = path.stat().st_size
except OSError as error:
result.failures += 1
succeeded = False
print(
f"job retention: stat failed path={path} error={error}",
file=output,
)
continue
if dry_run:
continue
try:
path.unlink()
except FileNotFoundError:
continue
except OSError as error:
result.failures += 1
succeeded = False
print(
f"job retention: delete failed path={path} error={error}",
file=output,
)
continue
result.deleted_files += 1
result.bytes_deleted += file_size
return succeeded
def _coerce_datetime(value: datetime | str) -> datetime:
if isinstance(value, datetime):
if value.tzinfo is None:
return value.replace(tzinfo=UTC)
return value.astimezone(UTC)
parsed = datetime.fromisoformat(value)
if parsed.tzinfo is None:
return parsed.replace(tzinfo=UTC)
return parsed.astimezone(UTC)
def _bool_text(value: bool) -> str:
return "true" if value else "false"

View file

@ -50,6 +50,16 @@ class JobArtifacts:
stats_path=log_dir / f"{prefix}.jsonl",
)
def existing_paths(self) -> tuple[Path, ...]:
prefix = self.log_path.with_suffix("").name
return tuple(
sorted(
path
for path in self.log_path.parent.glob(f"{prefix}.*")
if path.is_file()
)
)
@dataclass
class RunningWorker:
@ -793,8 +803,12 @@ def clear_completed_executions(*, log_dir: str | Path) -> int:
for execution in completed_executions:
job = cast(Job, execution.job)
prefix = f"job-{_job_id(job)}-execution-{_execution_id(execution)}"
for artifact_path in resolved_log_dir.glob(f"{prefix}.*"):
artifacts = JobArtifacts.for_execution(
log_dir=resolved_log_dir,
job_id=_job_id(job),
execution_id=_execution_id(execution),
)
for artifact_path in artifacts.existing_paths():
artifact_path.unlink(missing_ok=True)
execution_ids = tuple(