# republisher/tests/test_scheduler_runtime.py
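"""Tests for the JobRuntime scheduler and its web views.

Covers syncing jobs into APScheduler, manual and scheduled runs, the persistent
execution queue and its concurrency limit, cancellation, reconciling worker
state after an app restart, and the runs and execution-log views.
"""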

from __future__ import annotations
import asyncio
import json
import socketserver
import subprocess
import sys
import threading
import time
from datetime import UTC, datetime, timedelta
from http.server import BaseHTTPRequestHandler
from pathlib import Path
from repub.job_runner import generate_pangea_feed
from repub.jobs import JobArtifacts, JobRuntime, load_runs_view
from repub.model import (
Job,
JobExecution,
JobExecutionStatus,
Source,
create_source,
initialize_database,
save_setting,
)
from repub.web import create_app, get_job_runtime, render_execution_logs, render_runs
FIXTURE_FEED_PATH = (
Path(__file__).resolve().parents[1] / "demo" / "fixtures" / "local-feed.rss"
).resolve()
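

# Creates a fresh test database and seeds the feed_url setting used by the app.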
def initialize_runtime_database(db_path: Path) -> None:
initialize_database(db_path)
save_setting("feed_url", "http://localhost:8080")
def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
initialize_runtime_database(tmp_path / "scheduler.db")
enabled_source = create_source(
name="Enabled source",
slug="enabled-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=True,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/enabled.xml",
)
disabled_source = create_source(
name="Disabled source",
slug="disabled-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="15",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/disabled.xml",
)
enabled_job = Job.get(Job.source == enabled_source)
disabled_job = Job.get(Job.source == disabled_source)
runtime = JobRuntime(log_dir=tmp_path / "out" / "logs")
try:
runtime.start()
runtime.sync_jobs()
scheduled_ids = {job.id for job in runtime.scheduler.get_jobs()}
assert f"job-{enabled_job.id}" in scheduled_ids
assert f"job-{disabled_job.id}" not in scheduled_ids
enabled_job.enabled = False
enabled_job.save()
runtime.sync_jobs()
scheduled_ids = {job.id for job in runtime.scheduler.get_jobs()}
assert f"job-{enabled_job.id}" not in scheduled_ids
finally:
runtime.shutdown()
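

# A manual run against the local fixture feed succeeds, records request/item/byte
# counters and timestamps, writes log and stats artifacts, and publishes feed.rss.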
def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
tmp_path: Path,
) -> None:
initialize_runtime_database(tmp_path / "run-now.db")
source = create_source(
name="Manual source",
slug="manual-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=FIXTURE_FEED_PATH.as_uri(),
)
job = Job.get(Job.source == source)
runtime = JobRuntime(log_dir=tmp_path / "out" / "logs")
try:
runtime.start()
execution_id = runtime.run_job_now(job.id, reason="manual")
assert execution_id is not None
execution = _wait_for_terminal_execution(execution_id)
artifacts = JobArtifacts.for_execution(
log_dir=tmp_path / "out" / "logs",
job_id=job.id,
execution_id=execution_id,
)
assert execution.running_status == JobExecutionStatus.SUCCEEDED
assert execution.started_at is not None
assert execution.ended_at is not None
assert execution.requests_count > 0
assert execution.items_count > 0
assert execution.bytes_count > 0
assert artifacts.log_path.exists()
assert artifacts.stats_path.exists()
output_path = tmp_path / "out" / "feeds" / "manual-source" / "feed.rss"
assert output_path.exists()
output_text = output_path.read_text(encoding="utf-8")
assert "<title>Local Demo Feed</title>" in output_text
assert "<title>Local Demo Entry</title>" in output_text
stats_lines = [
json.loads(line)
for line in artifacts.stats_path.read_text(encoding="utf-8").splitlines()
]
assert len(stats_lines) >= 2
assert stats_lines[-1]["requests_count"] == execution.requests_count
finally:
runtime.shutdown()
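

# With max_concurrent_jobs=1 a second manual run stays PENDING while the first is
# RUNNING, and cancelling the running execution marks it CANCELED.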
def test_job_runtime_respects_max_concurrent_jobs_setting(tmp_path: Path) -> None:
db_path = tmp_path / "max-concurrency.db"
log_dir = tmp_path / "out" / "logs"
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
first_source = create_source(
name="First source",
slug="first-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
second_source = create_source(
name="Second source",
slug="second-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
first_job = Job.get(Job.source == first_source)
second_job = Job.get(Job.source == second_source)
runtime = JobRuntime(log_dir=log_dir)
try:
runtime.start()
first_execution_id = runtime.run_job_now(first_job.id, reason="manual")
assert first_execution_id is not None
_wait_for_running_execution(first_execution_id)
second_execution_id = runtime.run_job_now(second_job.id, reason="manual")
assert second_execution_id is not None
second_execution = _wait_for_execution_status(
second_execution_id,
JobExecutionStatus.PENDING,
)
assert (
JobExecution.select()
.where(JobExecution.running_status == JobExecutionStatus.RUNNING)
.count()
== 1
)
assert second_execution.started_at is None
assert (
JobExecution.select()
.where(JobExecution.running_status == JobExecutionStatus.PENDING)
.count()
== 1
)
runtime.request_execution_cancel(first_execution_id)
finished_execution = _wait_for_terminal_execution(first_execution_id)
assert finished_execution.running_status == JobExecutionStatus.CANCELED
finally:
runtime.shutdown()
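

# Once the running execution is cancelled and capacity opens up, the queued
# execution is started and completes successfully.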
def test_job_runtime_starts_queued_execution_after_capacity_opens(
tmp_path: Path,
) -> None:
db_path = tmp_path / "drain-queue.db"
log_dir = tmp_path / "out" / "logs"
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
first_source = create_source(
name="First source",
slug="first-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
second_source = create_source(
name="Second source",
slug="second-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=FIXTURE_FEED_PATH.as_uri(),
)
first_job = Job.get(Job.source == first_source)
second_job = Job.get(Job.source == second_source)
runtime = JobRuntime(log_dir=log_dir)
try:
runtime.start()
first_execution_id = runtime.run_job_now(first_job.id, reason="manual")
assert first_execution_id is not None
_wait_for_running_execution(first_execution_id)
second_execution_id = runtime.run_job_now(second_job.id, reason="manual")
assert second_execution_id is not None
_wait_for_execution_status(second_execution_id, JobExecutionStatus.PENDING)
runtime.request_execution_cancel(first_execution_id)
finished_execution = _wait_for_terminal_execution(first_execution_id)
assert finished_execution.running_status == JobExecutionStatus.CANCELED
_wait_for_running_execution(second_execution_id)
drained_execution = _wait_for_terminal_execution(second_execution_id)
assert drained_execution.running_status == JobExecutionStatus.SUCCEEDED
assert drained_execution.started_at is not None
finally:
runtime.shutdown()
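

# Requesting a manual run twice for an already-queued job returns the same
# execution id and keeps exactly one PENDING row.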
def test_job_runtime_deduplicates_manual_queue_requests(tmp_path: Path) -> None:
db_path = tmp_path / "queue-dedup.db"
log_dir = tmp_path / "out" / "logs"
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
blocking_source = create_source(
name="Blocking source",
slug="blocking-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
queued_source = create_source(
name="Queued source",
slug="queued-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/queued.xml",
)
blocking_job = Job.get(Job.source == blocking_source)
queued_job = Job.get(Job.source == queued_source)
runtime = JobRuntime(log_dir=log_dir)
try:
runtime.start()
blocking_execution_id = runtime.run_job_now(
blocking_job.id, reason="manual"
)
assert blocking_execution_id is not None
_wait_for_running_execution(blocking_execution_id)
first_pending_id = runtime.run_job_now(queued_job.id, reason="manual")
second_pending_id = runtime.run_job_now(queued_job.id, reason="manual")
assert first_pending_id is not None
assert second_pending_id == first_pending_id
assert (
JobExecution.select()
.where(
(JobExecution.job == queued_job)
& (JobExecution.running_status == JobExecutionStatus.PENDING)
)
.count()
== 1
)
finally:
runtime.shutdown()
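

# A job can have at most one RUNNING and one PENDING execution; additional manual
# or scheduled requests reuse the existing pending execution.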
def test_job_runtime_allows_one_running_and_one_pending_per_job(
tmp_path: Path,
) -> None:
db_path = tmp_path / "running-plus-pending.db"
log_dir = tmp_path / "out" / "logs"
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
source = create_source(
name="Busy source",
slug="busy-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
job = Job.get(Job.source == source)
runtime = JobRuntime(log_dir=log_dir)
try:
runtime.start()
running_execution_id = runtime.run_job_now(job.id, reason="manual")
assert running_execution_id is not None
_wait_for_running_execution(running_execution_id)
pending_execution_id = runtime.run_job_now(job.id, reason="manual")
duplicate_pending_id = runtime.run_job_now(job.id, reason="manual")
runtime.run_scheduled_job(job.id)
assert pending_execution_id is not None
assert duplicate_pending_id == pending_execution_id
assert (
JobExecution.select()
.where(JobExecution.job == job)
.where(JobExecution.running_status == JobExecutionStatus.RUNNING)
.count()
== 1
)
assert (
JobExecution.select()
.where(JobExecution.job == job)
.where(JobExecution.running_status == JobExecutionStatus.PENDING)
.count()
== 1
)
finally:
runtime.shutdown()
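

# PENDING executions already present in the database are picked up and run when
# the runtime starts.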
def test_job_runtime_start_drains_pending_rows_created_before_start(
tmp_path: Path,
) -> None:
db_path = tmp_path / "startup-drain.db"
log_dir = tmp_path / "out" / "logs"
initialize_runtime_database(db_path)
source = create_source(
name="Queued source",
slug="queued-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=FIXTURE_FEED_PATH.as_uri(),
)
job = Job.get(Job.source == source)
pending_execution = JobExecution.create(
job=job,
running_status=JobExecutionStatus.PENDING,
)
runtime = JobRuntime(log_dir=log_dir)
try:
runtime.start()
_wait_for_running_execution(int(pending_execution.get_id()))
drained_execution = _wait_for_terminal_execution(
int(pending_execution.get_id())
)
assert drained_execution.running_status == JobExecutionStatus.SUCCEEDED
assert drained_execution.started_at is not None
finally:
runtime.shutdown()
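

# Scheduled runs go through the same persistent queue: with capacity exhausted,
# the second scheduled job is recorded as PENDING instead of starting.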
def test_job_runtime_scheduled_runs_use_the_persistent_queue(
tmp_path: Path,
) -> None:
db_path = tmp_path / "scheduled-queue.db"
log_dir = tmp_path / "out" / "logs"
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
first_source = create_source(
name="First scheduled source",
slug="first-scheduled-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=True,
cron_minute="*",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
second_source = create_source(
name="Second scheduled source",
slug="second-scheduled-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=True,
cron_minute="*",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/second-scheduled.xml",
)
first_job = Job.get(Job.source == first_source)
second_job = Job.get(Job.source == second_source)
runtime = JobRuntime(log_dir=log_dir)
try:
runtime.start()
runtime.run_scheduled_job(first_job.id)
first_execution = JobExecution.get(JobExecution.job == first_job)
_wait_for_running_execution(int(first_execution.get_id()))
runtime.run_scheduled_job(second_job.id)
second_execution = JobExecution.get(JobExecution.job == second_job)
assert second_execution.running_status == JobExecutionStatus.PENDING
assert second_execution.started_at is None
finally:
runtime.shutdown()
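

# Cancelling a queued follow-up execution deletes its row without disturbing the
# job's currently running worker.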
def test_job_runtime_cancel_pending_follow_up_keeps_running_worker_alive(
tmp_path: Path,
) -> None:
db_path = tmp_path / "cancel-pending.db"
log_dir = tmp_path / "out" / "logs"
initialize_runtime_database(db_path)
save_setting("max_concurrent_jobs", 1)
with _slow_feed_server() as feed_url:
source = create_source(
name="Cancelable queued source",
slug="cancelable-queued-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
job = Job.get(Job.source == source)
runtime = JobRuntime(log_dir=log_dir)
try:
runtime.start()
running_execution_id = runtime.run_job_now(job.id, reason="manual")
assert running_execution_id is not None
_wait_for_running_execution(running_execution_id)
pending_execution_id = runtime.run_job_now(job.id, reason="manual")
assert pending_execution_id is not None
_wait_for_execution_status(pending_execution_id, JobExecutionStatus.PENDING)
assert runtime.cancel_queued_execution(pending_execution_id) is True
assert JobExecution.get_or_none(id=pending_execution_id) is None
assert (
JobExecution.get_by_id(running_execution_id).running_status
== JobExecutionStatus.RUNNING
)
finally:
runtime.shutdown()
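

# request_execution_cancel() stops a running execution, marks it CANCELED with
# stop/end timestamps, and logs the graceful stop request.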
def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
initialize_runtime_database(tmp_path / "cancel.db")
with _slow_feed_server() as feed_url:
source = create_source(
name="Cancelable source",
slug="cancelable-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
job = Job.get(Job.source == source)
runtime = JobRuntime(log_dir=tmp_path / "out" / "logs")
try:
runtime.start()
execution_id = runtime.run_job_now(job.id, reason="manual")
assert execution_id is not None
_wait_for_running_execution(execution_id)
runtime.request_execution_cancel(execution_id)
execution = _wait_for_terminal_execution(execution_id)
artifacts = JobArtifacts.for_execution(
log_dir=tmp_path / "out" / "logs",
job_id=job.id,
execution_id=execution_id,
)
assert execution.running_status == JobExecutionStatus.CANCELED
assert execution.ended_at is not None
assert execution.stop_requested_at is not None
assert "graceful stop requested" in artifacts.log_path.read_text(
encoding="utf-8"
)
finally:
runtime.shutdown()
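

# On startup, a RUNNING execution with no live worker process is marked FAILED
# and the reconciliation is noted in its log file.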
def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
initialize_runtime_database(tmp_path / "stale-running.db")
source = create_source(
name="Stale source",
slug="stale-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/stale.xml",
)
job = Job.get(Job.source == source)
execution = JobExecution.create(
job=job,
started_at="2026-03-30 12:30:00+00:00",
running_status=JobExecutionStatus.RUNNING,
)
artifacts = JobArtifacts.for_execution(
log_dir=tmp_path / "out" / "logs",
job_id=job.id,
execution_id=int(execution.get_id()),
)
artifacts.log_path.parent.mkdir(parents=True, exist_ok=True)
artifacts.log_path.write_text(
"worker: process lost during app restart\n",
encoding="utf-8",
)
runtime = JobRuntime(log_dir=tmp_path / "out" / "logs")
try:
runtime.start()
reconciled_execution = JobExecution.get_by_id(execution.get_id())
assert reconciled_execution.running_status == JobExecutionStatus.FAILED
assert reconciled_execution.ended_at is not None
assert "marked failed after app restart" in artifacts.log_path.read_text(
encoding="utf-8"
)
finally:
runtime.shutdown()
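

# poll_workers() notifies the refresh callback while an execution is RUNNING and
# the previous runtime refresh is old enough.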
def test_job_runtime_publishes_refresh_while_jobs_are_running(tmp_path: Path) -> None:
initialize_runtime_database(tmp_path / "runtime-refresh.db")
source = create_source(
name="Running source",
slug="running-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/running.xml",
)
job = Job.get(Job.source == source)
JobExecution.create(
job=job,
started_at=datetime(2026, 3, 30, 12, 0, tzinfo=UTC),
running_status=JobExecutionStatus.RUNNING,
)
events: list[object] = []
runtime = JobRuntime(
log_dir=tmp_path / "out" / "logs",
refresh_callback=events.append,
)
runtime._last_runtime_refresh_at = time.monotonic() - 2.0
runtime.poll_workers()
assert "refresh-event" in events
def test_job_runtime_start_reattaches_live_worker_after_app_restart(
tmp_path: Path,
) -> None:
db_path = tmp_path / "live-worker.db"
log_dir = tmp_path / "out" / "logs"
initialize_runtime_database(db_path)
with _slow_feed_server() as feed_url:
source = create_source(
name="Live worker source",
slug="live-worker-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
job = Job.get(Job.source == source)
execution = JobExecution.create(
job=job,
started_at=datetime.now(UTC),
running_status=JobExecutionStatus.RUNNING,
)
artifacts = JobArtifacts.for_execution(
log_dir=log_dir,
job_id=job.id,
execution_id=int(execution.get_id()),
)
artifacts.log_path.parent.mkdir(parents=True, exist_ok=True)
log_handle = artifacts.log_path.open("a", encoding="utf-8", buffering=1)
process = subprocess.Popen(
[
sys.executable,
"-u",
"-m",
"repub.job_runner",
"--job-id",
str(job.id),
"--execution-id",
str(execution.get_id()),
"--db-path",
str(db_path),
"--out-dir",
str(log_dir.parent),
"--stats-path",
str(artifacts.stats_path),
],
stdout=log_handle,
stderr=subprocess.STDOUT,
text=True,
)
runtime = JobRuntime(log_dir=log_dir)
try:
time.sleep(0.1)
runtime.start()
running_execution = JobExecution.get_by_id(execution.get_id())
assert running_execution.running_status == JobExecutionStatus.RUNNING
assert running_execution.ended_at is None
completed_execution = _wait_for_terminal_execution(int(execution.get_id()))
assert completed_execution.running_status == JobExecutionStatus.SUCCEEDED
assert "reattached" in artifacts.log_path.read_text(encoding="utf-8")
finally:
runtime.shutdown()
if process.poll() is None:
process.kill()
process.wait(timeout=2)
log_handle.close()
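

# On startup, an execution wrongly marked FAILED whose worker process is still
# alive is restored to RUNNING and allowed to complete.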
def test_job_runtime_start_restores_live_worker_marked_failed_by_restart_bug(
tmp_path: Path,
) -> None:
db_path = tmp_path / "restore-live-worker.db"
log_dir = tmp_path / "out" / "logs"
initialize_runtime_database(db_path)
with _slow_feed_server() as feed_url:
source = create_source(
name="Recovered worker source",
slug="recovered-worker-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=feed_url,
)
job = Job.get(Job.source == source)
execution = JobExecution.create(
job=job,
started_at=datetime.now(UTC),
ended_at=datetime.now(UTC),
running_status=JobExecutionStatus.FAILED,
)
artifacts = JobArtifacts.for_execution(
log_dir=log_dir,
job_id=job.id,
execution_id=int(execution.get_id()),
)
artifacts.log_path.parent.mkdir(parents=True, exist_ok=True)
log_handle = artifacts.log_path.open("a", encoding="utf-8", buffering=1)
process = subprocess.Popen(
[
sys.executable,
"-u",
"-m",
"repub.job_runner",
"--job-id",
str(job.id),
"--execution-id",
str(execution.get_id()),
"--db-path",
str(db_path),
"--out-dir",
str(log_dir.parent),
"--stats-path",
str(artifacts.stats_path),
],
stdout=log_handle,
stderr=subprocess.STDOUT,
text=True,
)
runtime = JobRuntime(log_dir=log_dir)
try:
time.sleep(0.1)
runtime.start()
restored_execution = JobExecution.get_by_id(execution.get_id())
assert restored_execution.running_status == JobExecutionStatus.RUNNING
assert restored_execution.ended_at is None
completed_execution = _wait_for_terminal_execution(int(execution.get_id()))
assert completed_execution.running_status == JobExecutionStatus.SUCCEEDED
assert "restored execution state" in artifacts.log_path.read_text(
encoding="utf-8"
)
finally:
runtime.shutdown()
if process.poll() is None:
process.kill()
process.wait(timeout=2)
log_handle.close()
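

# generate_pangea_feed() drives the configured Pangea feed class (stubbed here)
# and writes pangea.rss under the source's output directory.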
def test_generate_pangea_feed_writes_pangea_rss_file(
monkeypatch, tmp_path: Path
) -> None:
class StubPangeaFeed:
def __init__(self, config, feeds):
self.config = config
self.feed = feeds[0]
def acquire_content(self) -> None:
return None
def generate_feed(self) -> None:
return None
def disgorge(self, slug: str):
output_path = self.config.results.output_directory / slug / "pangea.rss"
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
"<rss><channel><title>Pangea Fixture</title></channel></rss>\n",
encoding="utf-8",
)
return output_path
monkeypatch.setattr(
"repub.job_runner.pangea_feed_class",
lambda: StubPangeaFeed,
)
output_path = generate_pangea_feed(
name="Pangea source",
slug="pangea-source",
domain="example.org",
category_name="News",
content_type="articles",
only_newest=True,
max_articles=10,
oldest_article=3,
include_authors=True,
exclude_media=False,
include_content=True,
content_format="MOBILE_3",
out_dir=tmp_path / "out",
log_path=tmp_path / "out" / "logs" / "pangea.log",
)
assert output_path == (tmp_path / "out" / "feeds" / "pangea-source" / "pangea.rss")
assert output_path.exists()
assert "Pangea Fixture" in output_path.read_text(encoding="utf-8")
def test_load_runs_view_humanizes_completed_execution_end_time(
monkeypatch, tmp_path: Path
) -> None:
db_path = tmp_path / "runs-view.db"
log_dir = tmp_path / "out" / "logs"
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
app = create_app()
app.config["REPUB_LOG_DIR"] = log_dir
source = create_source(
name="Completed source",
slug="completed-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/completed.xml",
)
job = Job.get(Job.source == source)
reference_time = datetime(2026, 1, 15, 12, 0, tzinfo=UTC)
ended_at = reference_time - timedelta(hours=2)
JobExecution.create(
job=job,
running_status=JobExecutionStatus.SUCCEEDED,
ended_at=ended_at,
)
view = load_runs_view(log_dir=app.config["REPUB_LOG_DIR"], now=reference_time)
completed = view["completed"][0]
assert completed["ended_at"] == "2 hours ago"
assert completed["ended_at_iso"] == ended_at.isoformat()
def test_render_runs_uses_database_backed_jobs_and_executions(
monkeypatch, tmp_path: Path
) -> None:
db_path = tmp_path / "runs-page.db"
log_dir = tmp_path / "out" / "logs"
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
app = create_app()
app.config["REPUB_LOG_DIR"] = log_dir
save_setting("feed_url", "http://localhost:8080")
source = create_source(
name="Runs page source",
slug="runs-page-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=True,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url=FIXTURE_FEED_PATH.as_uri(),
)
job = Job.get(Job.source == source)
runtime = get_job_runtime(app)
runtime.start()
try:
execution_id = runtime.run_job_now(job.id, reason="manual")
assert execution_id is not None
execution = _wait_for_terminal_execution(execution_id)
async def run() -> None:
body = str(await render_runs(app))
assert "runs-page-source" in body
assert "Running jobs" in body
assert "Scheduled jobs" in body
assert "Completed job executions" in body
assert f"/job/{job.id}/execution/{execution.get_id()}/logs" in body
assert "Succeeded" in body
assert "Run now" in body
asyncio.run(run())
finally:
runtime.shutdown()
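

# The execution-log view shows distinct messages for a missing execution and for
# an execution whose log file has not been created yet.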
def test_render_execution_logs_handles_missing_execution_and_missing_log_file(
monkeypatch, tmp_path: Path
) -> None:
db_path = tmp_path / "log-errors.db"
log_dir = tmp_path / "out" / "logs"
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
app = create_app()
app.config["REPUB_LOG_DIR"] = log_dir
source = create_source(
name="Log source",
slug="log-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=False,
cron_minute="*/5",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/log-source.xml",
)
job = Job.get(Job.source == source)
execution = JobExecution.create(
job=job,
running_status=JobExecutionStatus.FAILED,
)
async def run() -> None:
missing_execution = str(
await render_execution_logs(app, job_id=job.id, execution_id=9999)
)
missing_log = str(
await render_execution_logs(app, job_id=job.id, execution_id=execution.id)
)
assert "Execution log unavailable" in missing_execution
assert "Execution does not exist." in missing_execution
assert "Execution log unavailable" in missing_log
assert "Log file has not been created yet." in missing_log
asyncio.run(run())
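

# POST /actions/jobs/<id>/delete removes the job together with its source and its
# execution history.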
def test_delete_job_action_removes_source_job_and_execution_history(
monkeypatch, tmp_path: Path
) -> None:
db_path = tmp_path / "delete-job.db"
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
async def run() -> None:
app = create_app()
client = app.test_client()
source = create_source(
name="Delete source",
slug="delete-source",
source_type="feed",
notes="",
spider_arguments="",
enabled=True,
cron_minute="*/30",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/delete.xml",
)
job = Job.get(Job.source == source)
execution = JobExecution.create(
job=job,
running_status=JobExecutionStatus.SUCCEEDED,
)
response = await client.post(f"/actions/jobs/{job.id}/delete")
assert response.status_code == 204
assert Source.get_or_none(Source.slug == "delete-source") is None
assert Job.get_or_none(id=job.id) is None
assert JobExecution.get_or_none(id=int(execution.get_id())) is None
asyncio.run(run())
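

# POST /actions/sources/<slug>/delete removes the source together with its job
# and its execution history.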
def test_delete_source_action_removes_source_job_and_execution_history(
monkeypatch, tmp_path: Path
) -> None:
db_path = tmp_path / "delete-source.db"
monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
async def run() -> None:
app = create_app()
client = app.test_client()
source = create_source(
name="Delete source row",
slug="delete-source-row",
source_type="feed",
notes="",
spider_arguments="",
enabled=True,
cron_minute="*/30",
cron_hour="*",
cron_day_of_month="*",
cron_day_of_week="*",
cron_month="*",
feed_url="https://example.com/delete-source-row.xml",
)
job = Job.get(Job.source == source)
execution = JobExecution.create(
job=job,
running_status=JobExecutionStatus.SUCCEEDED,
)
response = await client.post("/actions/sources/delete-source-row/delete")
assert response.status_code == 204
assert Source.get_or_none(Source.slug == "delete-source-row") is None
assert Job.get_or_none(id=job.id) is None
assert JobExecution.get_or_none(id=int(execution.get_id())) is None
asyncio.run(run())
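

# Polling helpers: wait (with a timeout) for an execution to reach RUNNING, a
# specific status, or a terminal state, failing the test otherwise.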
def _wait_for_running_execution(
execution_id: int, *, timeout_seconds: float = 2.0
) -> JobExecution:
deadline = time.monotonic() + timeout_seconds
while time.monotonic() < deadline:
execution = JobExecution.get_by_id(execution_id)
if execution.running_status == JobExecutionStatus.RUNNING:
return execution
time.sleep(0.02)
raise AssertionError(f"execution {execution_id} never entered RUNNING state")
def _wait_for_execution_status(
execution_id: int,
status: JobExecutionStatus,
*,
timeout_seconds: float = 2.0,
) -> JobExecution:
deadline = time.monotonic() + timeout_seconds
while time.monotonic() < deadline:
execution = JobExecution.get_by_id(execution_id)
if execution.running_status == status:
return execution
time.sleep(0.02)
raise AssertionError(f"execution {execution_id} never entered {status.name}")
def _wait_for_terminal_execution(
execution_id: int, *, timeout_seconds: float = 4.0
) -> JobExecution:
deadline = time.monotonic() + timeout_seconds
while time.monotonic() < deadline:
execution = JobExecution.get_by_id(execution_id)
if execution.running_status in {
JobExecutionStatus.SUCCEEDED,
JobExecutionStatus.FAILED,
JobExecutionStatus.CANCELED,
}:
return execution
time.sleep(0.02)
raise AssertionError(f"execution {execution_id} did not finish in time")
class _SlowFeedRequestHandler(BaseHTTPRequestHandler):
def do_GET(self) -> None: # noqa: N802
time.sleep(2.0)
payload = FIXTURE_FEED_PATH.read_bytes()
self.send_response(200)
self.send_header("Content-Type", "application/rss+xml; charset=utf-8")
self.send_header("Content-Length", str(len(payload)))
self.end_headers()
self.wfile.write(payload)
def log_message(self, format: str, *args: object) -> None:
del format, args
class _ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
allow_reuse_address = True
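

# Context manager that runs the slow handler on a local threaded HTTP server and
# yields the feed URL.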
class _slow_feed_server:
def __enter__(self) -> str:
self._server = _ThreadedTCPServer(("127.0.0.1", 0), _SlowFeedRequestHandler)
self._thread = threading.Thread(
target=self._server.serve_forever,
kwargs={"poll_interval": 0.01},
daemon=True,
)
self._thread.start()
host = str(self._server.server_address[0])
port = int(self._server.server_address[1])
return f"http://{host}:{port}/slow-feed.rss"
def __exit__(self, exc_type, exc, tb) -> None:
del exc_type, exc, tb
self._server.shutdown()
self._server.server_close()
self._thread.join(timeout=1)