implement job runner and scheduler

2026-03-30 14:02:39 +02:00 · 2026-03-30 14:02:39 +02:00 · 2b2a3f1cc0
commit 2b2a3f1cc0
parent 328a70ff9b
11 changed files with 1572 additions and 284 deletions
--- a/tests/test_scheduler_runtime.py
+++ b/tests/test_scheduler_runtime.py
@ -0,0 +1,346 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import time
+from pathlib import Path
+
+from repub.jobs import JobArtifacts, JobRuntime
+from repub.model import (
+    Job,
+    JobExecution,
+    JobExecutionStatus,
+    Source,
+    create_source,
+    initialize_database,
+)
+from repub.web import create_app, get_job_runtime, render_execution_logs, render_runs
+
+
+def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
+    initialize_database(tmp_path / "scheduler.db")
+    enabled_source = create_source(
+        name="Enabled source",
+        slug="enabled-source",
+        source_type="feed",
+        notes="",
+        spider_arguments="",
+        enabled=True,
+        cron_minute="*/5",
+        cron_hour="*",
+        cron_day_of_month="*",
+        cron_day_of_week="*",
+        cron_month="*",
+        feed_url="https://example.com/enabled.xml",
+    )
+    disabled_source = create_source(
+        name="Disabled source",
+        slug="disabled-source",
+        source_type="feed",
+        notes="",
+        spider_arguments="",
+        enabled=False,
+        cron_minute="15",
+        cron_hour="*",
+        cron_day_of_month="*",
+        cron_day_of_week="*",
+        cron_month="*",
+        feed_url="https://example.com/disabled.xml",
+    )
+    enabled_job = Job.get(Job.source == enabled_source)
+    disabled_job = Job.get(Job.source == disabled_source)
+
+    runtime = JobRuntime(
+        log_dir=tmp_path / "out" / "logs",
+        worker_duration_seconds=0.4,
+        worker_stats_interval_seconds=0.05,
+        worker_failure_probability=0.0,
+    )
+    try:
+        runtime.start()
+        runtime.sync_jobs()
+
+        scheduled_ids = {job.id for job in runtime.scheduler.get_jobs()}
+
+        assert f"job-{enabled_job.id}" in scheduled_ids
+        assert f"job-{disabled_job.id}" not in scheduled_ids
+
+        enabled_job.enabled = False
+        enabled_job.save()
+        runtime.sync_jobs()
+
+        scheduled_ids = {job.id for job in runtime.scheduler.get_jobs()}
+        assert f"job-{enabled_job.id}" not in scheduled_ids
+    finally:
+        runtime.shutdown()
+
+
+def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
+    tmp_path: Path,
+) -> None:
+    initialize_database(tmp_path / "run-now.db")
+    source = create_source(
+        name="Manual source",
+        slug="manual-source",
+        source_type="feed",
+        notes="",
+        spider_arguments="",
+        enabled=False,
+        cron_minute="*/5",
+        cron_hour="*",
+        cron_day_of_month="*",
+        cron_day_of_week="*",
+        cron_month="*",
+        feed_url="https://example.com/manual.xml",
+    )
+    job = Job.get(Job.source == source)
+
+    runtime = JobRuntime(
+        log_dir=tmp_path / "out" / "logs",
+        worker_duration_seconds=0.35,
+        worker_stats_interval_seconds=0.05,
+        worker_failure_probability=0.0,
+    )
+    try:
+        runtime.start()
+        execution_id = runtime.run_job_now(job.id, reason="manual")
+        assert execution_id is not None
+        execution = _wait_for_terminal_execution(execution_id)
+        artifacts = JobArtifacts.for_execution(
+            log_dir=tmp_path / "out" / "logs",
+            job_id=job.id,
+            execution_id=execution_id,
+        )
+
+        assert execution.running_status == JobExecutionStatus.SUCCEEDED
+        assert execution.started_at is not None
+        assert execution.ended_at is not None
+        assert execution.requests_count > 0
+        assert execution.items_count > 0
+        assert execution.bytes_count > 0
+        assert artifacts.log_path.exists()
+        assert artifacts.stats_path.exists()
+        assert "starting simulated crawl" in artifacts.log_path.read_text(
+            encoding="utf-8"
+        )
+
+        stats_lines = [
+            json.loads(line)
+            for line in artifacts.stats_path.read_text(encoding="utf-8").splitlines()
+        ]
+        assert len(stats_lines) >= 2
+        assert stats_lines[-1]["requests_count"] == execution.requests_count
+    finally:
+        runtime.shutdown()
+
+
+def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
+    initialize_database(tmp_path / "cancel.db")
+    source = create_source(
+        name="Cancelable source",
+        slug="cancelable-source",
+        source_type="feed",
+        notes="",
+        spider_arguments="",
+        enabled=False,
+        cron_minute="*/5",
+        cron_hour="*",
+        cron_day_of_month="*",
+        cron_day_of_week="*",
+        cron_month="*",
+        feed_url="https://example.com/cancelable.xml",
+    )
+    job = Job.get(Job.source == source)
+
+    runtime = JobRuntime(
+        log_dir=tmp_path / "out" / "logs",
+        worker_duration_seconds=2.0,
+        worker_stats_interval_seconds=0.1,
+        worker_failure_probability=0.0,
+    )
+    try:
+        runtime.start()
+        execution_id = runtime.run_job_now(job.id, reason="manual")
+        assert execution_id is not None
+        _wait_for_running_execution(execution_id)
+
+        runtime.request_execution_cancel(execution_id)
+        execution = _wait_for_terminal_execution(execution_id)
+        artifacts = JobArtifacts.for_execution(
+            log_dir=tmp_path / "out" / "logs",
+            job_id=job.id,
+            execution_id=execution_id,
+        )
+
+        assert execution.running_status == JobExecutionStatus.CANCELED
+        assert execution.ended_at is not None
+        assert execution.stop_requested_at is not None
+        assert "graceful stop requested" in artifacts.log_path.read_text(
+            encoding="utf-8"
+        )
+    finally:
+        runtime.shutdown()
+
+
+def test_render_runs_uses_database_backed_jobs_and_executions(
+    monkeypatch, tmp_path: Path
+) -> None:
+    db_path = tmp_path / "runs-page.db"
+    log_dir = tmp_path / "out" / "logs"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
+
+    app = create_app()
+    app.config["REPUB_LOG_DIR"] = log_dir
+    app.config["REPUB_JOB_WORKER_DURATION_SECONDS"] = 0.35
+    app.config["REPUB_JOB_WORKER_STATS_INTERVAL_SECONDS"] = 0.05
+    app.config["REPUB_JOB_WORKER_FAILURE_PROBABILITY"] = 0.0
+
+    source = create_source(
+        name="Runs page source",
+        slug="runs-page-source",
+        source_type="feed",
+        notes="",
+        spider_arguments="",
+        enabled=True,
+        cron_minute="*/5",
+        cron_hour="*",
+        cron_day_of_month="*",
+        cron_day_of_week="*",
+        cron_month="*",
+        feed_url="https://example.com/runs-page.xml",
+    )
+    job = Job.get(Job.source == source)
+    runtime = get_job_runtime(app)
+    runtime.start()
+    try:
+        execution_id = runtime.run_job_now(job.id, reason="manual")
+        assert execution_id is not None
+        execution = _wait_for_terminal_execution(execution_id)
+
+        async def run() -> None:
+            body = str(await render_runs(app))
+
+            assert "runs-page-source" in body
+            assert "Running job executions" in body
+            assert "Upcoming jobs" in body
+            assert "Completed job executions" in body
+            assert f"/job/{job.id}/execution/{execution.get_id()}/logs" in body
+            assert "Succeeded" in body
+            assert "Run now" in body
+
+        asyncio.run(run())
+    finally:
+        runtime.shutdown()
+
+
+def test_render_execution_logs_handles_missing_execution_and_missing_log_file(
+    monkeypatch, tmp_path: Path
+) -> None:
+    db_path = tmp_path / "log-errors.db"
+    log_dir = tmp_path / "out" / "logs"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
+
+    app = create_app()
+    app.config["REPUB_LOG_DIR"] = log_dir
+    source = create_source(
+        name="Log source",
+        slug="log-source",
+        source_type="feed",
+        notes="",
+        spider_arguments="",
+        enabled=False,
+        cron_minute="*/5",
+        cron_hour="*",
+        cron_day_of_month="*",
+        cron_day_of_week="*",
+        cron_month="*",
+        feed_url="https://example.com/log-source.xml",
+    )
+    job = Job.get(Job.source == source)
+    execution = JobExecution.create(
+        job=job,
+        running_status=JobExecutionStatus.FAILED,
+    )
+
+    async def run() -> None:
+        missing_execution = str(
+            await render_execution_logs(app, job_id=job.id, execution_id=9999)
+        )
+        missing_log = str(
+            await render_execution_logs(app, job_id=job.id, execution_id=execution.id)
+        )
+
+        assert "Execution log unavailable" in missing_execution
+        assert "Execution does not exist." in missing_execution
+        assert "Execution log unavailable" in missing_log
+        assert "Log file has not been created yet." in missing_log
+
+    asyncio.run(run())
+
+
+def test_delete_job_action_removes_source_job_and_execution_history(
+    monkeypatch, tmp_path: Path
+) -> None:
+    db_path = tmp_path / "delete-job.db"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
+
+    async def run() -> None:
+        app = create_app()
+        client = app.test_client()
+
+        source = create_source(
+            name="Delete source",
+            slug="delete-source",
+            source_type="feed",
+            notes="",
+            spider_arguments="",
+            enabled=True,
+            cron_minute="*/30",
+            cron_hour="*",
+            cron_day_of_month="*",
+            cron_day_of_week="*",
+            cron_month="*",
+            feed_url="https://example.com/delete.xml",
+        )
+        job = Job.get(Job.source == source)
+        execution = JobExecution.create(
+            job=job,
+            running_status=JobExecutionStatus.SUCCEEDED,
+        )
+
+        response = await client.post(f"/actions/jobs/{job.id}/delete")
+
+        assert response.status_code == 204
+        assert Source.get_or_none(Source.slug == "delete-source") is None
+        assert Job.get_or_none(id=job.id) is None
+        assert JobExecution.get_or_none(id=int(execution.get_id())) is None
+
+    asyncio.run(run())
+
+
+def _wait_for_running_execution(
+    execution_id: int, *, timeout_seconds: float = 2.0
+) -> JobExecution:
+    deadline = time.monotonic() + timeout_seconds
+    while time.monotonic() < deadline:
+        execution = JobExecution.get_by_id(execution_id)
+        if execution.running_status == JobExecutionStatus.RUNNING:
+            return execution
+        time.sleep(0.02)
+    raise AssertionError(f"execution {execution_id} never entered RUNNING state")
+
+
+def _wait_for_terminal_execution(
+    execution_id: int, *, timeout_seconds: float = 4.0
+) -> JobExecution:
+    deadline = time.monotonic() + timeout_seconds
+    while time.monotonic() < deadline:
+        execution = JobExecution.get_by_id(execution_id)
+        if execution.running_status in {
+            JobExecutionStatus.SUCCEEDED,
+            JobExecutionStatus.FAILED,
+            JobExecutionStatus.CANCELED,
+        }:
+            return execution
+        time.sleep(0.02)
+    raise AssertionError(f"execution {execution_id} did not finish in time")
--- a/tests/test_web.py
+++ b/tests/test_web.py
@ -5,7 +5,15 @@ from pathlib import Path
 from typing import Any, cast

 from repub.datastar import RefreshBroker, render_sse_event, render_stream
-from repub.model import Job, Source, SourceFeed, SourcePangea, create_source
+from repub.model import (
+    Job,
+    JobExecution,
+    JobExecutionStatus,
+    Source,
+    SourceFeed,
+    SourcePangea,
+    create_source,
+)
 from repub.web import (
    create_app,
    get_refresh_broker,
@ -141,15 +149,20 @@ def test_render_stream_yields_on_connect_and_refresh() -> None:
    asyncio.run(run())


-def test_render_dashboard_shows_dashboard_information_architecture() -> None:
+def test_render_dashboard_shows_dashboard_information_architecture(
+    monkeypatch, tmp_path: Path
+) -> None:
+    db_path = tmp_path / "dashboard-render.db"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
+
    async def run() -> None:
-        body = str(await render_dashboard())
+        app = create_app()
+        body = str(await render_dashboard(app))

        assert "Operational snapshot" in body
        assert "Running executions" in body
        assert 'href="/sources"' in body
        assert 'href="/runs"' in body
-        assert "/job/7/execution/104/logs" in body
        assert "Create source" in body

    asyncio.run(run())
@ -569,27 +582,97 @@ def test_create_source_action_validates_duplicate_slug_and_pangea_type(
    asyncio.run(run())


-def test_render_runs_shows_running_upcoming_and_completed_tables() -> None:
+def test_render_runs_shows_running_upcoming_and_completed_tables(
+    monkeypatch, tmp_path: Path
+) -> None:
+    db_path = tmp_path / "runs-render.db"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
+
    async def run() -> None:
-        body = str(await render_runs())
+        app = create_app()
+
+        source = create_source(
+            name="Runs render source",
+            slug="runs-render-source",
+            source_type="feed",
+            notes="",
+            spider_arguments="",
+            enabled=True,
+            cron_minute="*/30",
+            cron_hour="*",
+            cron_day_of_month="*",
+            cron_day_of_week="*",
+            cron_month="*",
+            feed_url="https://example.com/runs.xml",
+        )
+        job = Job.get(Job.source == source)
+        execution = JobExecution.create(
+            job=job,
+            running_status=JobExecutionStatus.SUCCEEDED,
+        )
+
+        body = str(await render_runs(app))

        assert "Running job executions" in body
        assert "Upcoming jobs" in body
        assert "Completed job executions" in body
-        assert "Delete confirmation" in body
-        assert "/job/11/execution/101/logs" in body
-        assert "Already running" in body
+        assert "runs-render-source" in body
+        assert f"/job/{job.id}/execution/{execution.get_id()}/logs" in body
+        assert "Already running" not in body

    asyncio.run(run())


-def test_render_execution_logs_uses_app_route() -> None:
-    async def run() -> None:
-        body = str(await render_execution_logs(job_id=7, execution_id=104))
+def test_render_execution_logs_uses_app_route(monkeypatch, tmp_path: Path) -> None:
+    db_path = tmp_path / "logs-render.db"
+    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))

-        assert "Job 7 / execution 104" in body
-        assert "/job/7/execution/104/logs" in body
-        assert "Streaming text log view" in body
+    async def run() -> None:
+        log_dir = tmp_path / "out" / "logs"
+        app = create_app()
+        app.config["REPUB_LOG_DIR"] = log_dir
+
+        source = create_source(
+            name="Log render source",
+            slug="log-render-source",
+            source_type="feed",
+            notes="",
+            spider_arguments="",
+            enabled=False,
+            cron_minute="*/30",
+            cron_hour="*",
+            cron_day_of_month="*",
+            cron_day_of_week="*",
+            cron_month="*",
+            feed_url="https://example.com/logs.xml",
+        )
+        job = Job.get(Job.source == source)
+        execution = JobExecution.create(
+            job=job,
+            running_status=JobExecutionStatus.RUNNING,
+        )
+        log_path = log_dir / f"job-{job.id}-execution-{execution.get_id()}.log"
+        log_path.parent.mkdir(parents=True, exist_ok=True)
+        log_path.write_text(
+            "\n".join(
+                (
+                    "scheduler: run_now requested",
+                    "worker: starting simulated crawl",
+                    "worker: waiting for more log lines ...",
+                )
+            ),
+            encoding="utf-8",
+        )
+
+        body = str(
+            await render_execution_logs(
+                app, job_id=job.id, execution_id=int(execution.get_id())
+            )
+        )
+
+        assert f"Job {job.id} / execution {execution.get_id()}" in body
+        assert f"/job/{job.id}/execution/{execution.get_id()}/logs" in body
+        assert "Route: /job/" in body
        assert "waiting for more log lines" in body

    asyncio.run(run())