implement scrapy + pygea job runner

This commit is contained in:
Abel Luck 2026-03-30 15:04:41 +02:00
parent 916968c579
commit 8af28c2f68
8 changed files with 888 additions and 163 deletions

View file

@ -188,14 +188,12 @@ class JobRuntime:
str(job_id),
"--execution-id",
str(execution_id),
"--db-path",
str(database.database),
"--out-dir",
str(self.log_dir.parent),
"--stats-path",
str(artifacts.stats_path),
"--duration-seconds",
str(self.worker_duration_seconds),
"--interval-seconds",
str(self.worker_stats_interval_seconds),
"--failure-probability",
str(self.worker_failure_probability),
],
stdout=log_handle,
stderr=subprocess.STDOUT,
@ -390,7 +388,7 @@ def load_runs_view(
for job in jobs
),
"completed": tuple(
_project_completed_execution(execution, resolved_log_dir)
_project_completed_execution(execution, resolved_log_dir, reference_time)
for execution in completed_executions
),
}
@ -401,6 +399,7 @@ def load_dashboard_view(
) -> dict[str, object]:
reference_time = now or datetime.now(UTC)
runs_view = load_runs_view(log_dir=log_dir, now=reference_time)
output_dir = Path(log_dir).parent
with database.connection_context():
failed_last_day = (
JobExecution.select()
@ -414,7 +413,7 @@ def load_dashboard_view(
upcoming_ready = sum(
1 for job in runs_view["upcoming"] if str(job["run_reason"]) == "Ready"
)
footprint_bytes = _directory_size(Path(log_dir))
footprint_bytes = _directory_size(output_dir)
return {
"running": runs_view["running"],
"snapshot": {
@ -538,7 +537,7 @@ def _project_upcoming_job(
"slug": job.source.slug,
"job_id": job_id,
"next_run": (
_humanize_future_time(reference_time, next_run)
_humanize_relative_time(reference_time, next_run)
if next_run is not None
else ("Running now" if running_execution is not None else "Not scheduled")
),
@ -565,7 +564,7 @@ def _project_upcoming_job(
def _project_completed_execution(
execution: JobExecution, log_dir: Path
execution: JobExecution, log_dir: Path, reference_time: datetime
) -> dict[str, object]:
job = cast(Job, execution.job)
job_id = _job_id(job)
@ -573,18 +572,22 @@ def _project_completed_execution(
artifacts = JobArtifacts.for_execution(
log_dir=log_dir, job_id=job_id, execution_id=execution_id
)
ended_at = (
_coerce_datetime(cast(datetime | str, execution.ended_at))
if execution.ended_at is not None
else None
)
return {
"source": job.source.name,
"slug": job.source.slug,
"job_id": job_id,
"execution_id": execution_id,
"ended_at": (
_coerce_datetime(cast(datetime | str, execution.ended_at)).strftime(
"%Y-%m-%d %H:%M UTC"
)
if execution.ended_at is not None
_humanize_relative_time(reference_time, ended_at)
if ended_at is not None
else "Pending"
),
"ended_at_iso": ended_at.isoformat() if ended_at is not None else None,
"status": _execution_status_label(execution),
"status_tone": _execution_status_tone(execution),
"stats": _stats_summary(execution),
@ -678,20 +681,25 @@ def _format_bytes(value: int) -> str:
return f"{value / (1024 * 1024 * 1024):.1f} GB"
def _humanize_relative_time(reference_time: datetime, target_time: datetime) -> str:
    """Describe ``target_time`` relative to ``reference_time`` in words.

    Args:
        reference_time: The instant treated as "now".
        target_time: The instant to describe. It may lie before or after
            ``reference_time``.

    Returns:
        ``"now"`` when the two instants are less than half a second apart,
        ``"in <count> <unit>"`` when ``target_time`` is in the future, and
        ``"<count> <unit> ago"`` when it is in the past. The unit is the
        largest of day, hour, minute, second that fits into the distance,
        and it is pluralized unless the count is exactly 1.

    Raises:
        TypeError: If one datetime is timezone-aware and the other is naive
            (raised by the datetime subtraction itself).
    """
    delta_seconds = int(round((target_time - reference_time).total_seconds()))
    if delta_seconds == 0:
        return "now"

    absolute_delta_seconds = abs(delta_seconds)
    # Ordered largest to smallest so the first unit that fits wins. Seconds
    # are part of the table (rather than a separate fallback) so the singular
    # form "1 second" goes through the same pluralization rule as the others.
    units = (
        ("day", 24 * 60 * 60),
        ("hour", 60 * 60),
        ("minute", 60),
        ("second", 1),
    )
    # The distance is at least one second here, so the final entry always
    # matches and next() cannot raise StopIteration.
    label, size = next(
        (name, seconds)
        for name, seconds in units
        if absolute_delta_seconds >= seconds
    )
    count = max(1, round(absolute_delta_seconds / size))
    suffix = "" if count == 1 else "s"
    if delta_seconds > 0:
        return f"in {count} {label}{suffix}"
    return f"{count} {label}{suffix} ago"