implement scrapy + pygea job runner

2026-03-30 15:04:41 +02:00 · 2026-03-30 15:04:41 +02:00 · 8af28c2f68
commit 8af28c2f68
parent 916968c579
8 changed files with 888 additions and 163 deletions
--- a/repub/jobs.py
+++ b/repub/jobs.py
@@ -188,14 +188,12 @@ class JobRuntime:
                str(job_id),
                "--execution-id",
                str(execution_id),
+                "--db-path",
+                str(database.database),
+                "--out-dir",
+                str(self.log_dir.parent),
                "--stats-path",
                str(artifacts.stats_path),
-                "--duration-seconds",
-                str(self.worker_duration_seconds),
-                "--interval-seconds",
-                str(self.worker_stats_interval_seconds),
-                "--failure-probability",
-                str(self.worker_failure_probability),
            ],
            stdout=log_handle,
            stderr=subprocess.STDOUT,
@@ -390,7 +388,7 @@ def load_runs_view(
            for job in jobs
        ),
        "completed": tuple(
-            _project_completed_execution(execution, resolved_log_dir)
+            _project_completed_execution(execution, resolved_log_dir, reference_time)
            for execution in completed_executions
        ),
    }
@@ -401,6 +399,7 @@ def load_dashboard_view(
 ) -> dict[str, object]:
    reference_time = now or datetime.now(UTC)
    runs_view = load_runs_view(log_dir=log_dir, now=reference_time)
+    output_dir = Path(log_dir).parent
    with database.connection_context():
        failed_last_day = (
            JobExecution.select()
@@ -414,7 +413,7 @@ def load_dashboard_view(
    upcoming_ready = sum(
        1 for job in runs_view["upcoming"] if str(job["run_reason"]) == "Ready"
    )
-    footprint_bytes = _directory_size(Path(log_dir))
+    footprint_bytes = _directory_size(output_dir)
    return {
        "running": runs_view["running"],
        "snapshot": {
@@ -538,7 +537,7 @@ def _project_upcoming_job(
        "slug": job.source.slug,
        "job_id": job_id,
        "next_run": (
-            _humanize_future_time(reference_time, next_run)
+            _humanize_relative_time(reference_time, next_run)
            if next_run is not None
            else ("Running now" if running_execution is not None else "Not scheduled")
        ),
@@ -565,7 +564,7 @@ def _project_upcoming_job(


 def _project_completed_execution(
-    execution: JobExecution, log_dir: Path
+    execution: JobExecution, log_dir: Path, reference_time: datetime
 ) -> dict[str, object]:
    job = cast(Job, execution.job)
    job_id = _job_id(job)
@@ -573,18 +572,22 @@ def _project_completed_execution(
    artifacts = JobArtifacts.for_execution(
        log_dir=log_dir, job_id=job_id, execution_id=execution_id
    )
+    ended_at = (
+        _coerce_datetime(cast(datetime | str, execution.ended_at))
+        if execution.ended_at is not None
+        else None
+    )
    return {
        "source": job.source.name,
        "slug": job.source.slug,
        "job_id": job_id,
        "execution_id": execution_id,
        "ended_at": (
-            _coerce_datetime(cast(datetime | str, execution.ended_at)).strftime(
-                "%Y-%m-%d %H:%M UTC"
-            )
-            if execution.ended_at is not None
+            _humanize_relative_time(reference_time, ended_at)
+            if ended_at is not None
            else "Pending"
        ),
+        "ended_at_iso": ended_at.isoformat() if ended_at is not None else None,
        "status": _execution_status_label(execution),
        "status_tone": _execution_status_tone(execution),
        "stats": _stats_summary(execution),
@@ -678,20 +681,25 @@ def _format_bytes(value: int) -> str:
    return f"{value / (1024 * 1024 * 1024):.1f} GB"


-def _humanize_future_time(reference_time: datetime, target_time: datetime) -> str:
+def _humanize_relative_time(reference_time: datetime, target_time: datetime) -> str:
    delta_seconds = int(round((target_time - reference_time).total_seconds()))
-    if delta_seconds <= 0:
+    if delta_seconds == 0:
        return "now"

+    absolute_delta_seconds = abs(delta_seconds)
    units = (
        ("day", 24 * 60 * 60),
        ("hour", 60 * 60),
        ("minute", 60),
    )
    for label, size in units:
-        if delta_seconds >= size:
-            count = max(1, round(delta_seconds / size))
+        if absolute_delta_seconds >= size:
+            count = max(1, round(absolute_delta_seconds / size))
            suffix = "" if count == 1 else "s"
-            return f"in {count} {label}{suffix}"
+            if delta_seconds > 0:
+                return f"in {count} {label}{suffix}"
+            return f"{count} {label}{suffix} ago"

-    return f"in {delta_seconds} seconds"
+    if delta_seconds > 0:
+        return f"in {absolute_delta_seconds} seconds"
+    return f"{absolute_delta_seconds} seconds ago"