from __future__ import annotations

import asyncio
import json
import socketserver
import threading
import time
from datetime import UTC, datetime, timedelta
from http.server import BaseHTTPRequestHandler
from pathlib import Path

from repub.job_runner import generate_pangea_feed
from repub.jobs import JobArtifacts, JobRuntime, load_runs_view
from repub.model import (
    Job,
    JobExecution,
    JobExecutionStatus,
    Source,
    create_source,
    initialize_database,
)
from repub.web import create_app, get_job_runtime, render_execution_logs, render_runs

FIXTURE_FEED_PATH = (
    Path(__file__).resolve().parents[1] / "demo" / "fixtures" / "local-feed.rss"
).resolve()


def test_job_runtime_syncs_enabled_jobs_into_apscheduler(tmp_path: Path) -> None:
    initialize_database(tmp_path / "scheduler.db")
    enabled_source = create_source(
        name="Enabled source",
        slug="enabled-source",
        source_type="feed",
        notes="",
        spider_arguments="",
        enabled=True,
        cron_minute="*/5",
        cron_hour="*",
        cron_day_of_month="*",
        cron_day_of_week="*",
        cron_month="*",
        feed_url="https://example.com/enabled.xml",
    )
    disabled_source = create_source(
        name="Disabled source",
        slug="disabled-source",
        source_type="feed",
        notes="",
        spider_arguments="",
        enabled=False,
        cron_minute="15",
        cron_hour="*",
        cron_day_of_month="*",
        cron_day_of_week="*",
        cron_month="*",
        feed_url="https://example.com/disabled.xml",
    )
    enabled_job = Job.get(Job.source == enabled_source)
    disabled_job = Job.get(Job.source == disabled_source)
    runtime = JobRuntime(
        log_dir=tmp_path / "out" / "logs",
        worker_duration_seconds=0.4,
        worker_stats_interval_seconds=0.05,
        worker_failure_probability=0.0,
    )
    try:
        runtime.start()
        runtime.sync_jobs()

        scheduled_ids = {job.id for job in runtime.scheduler.get_jobs()}
        assert f"job-{enabled_job.id}" in scheduled_ids
        assert f"job-{disabled_job.id}" not in scheduled_ids

        enabled_job.enabled = False
        enabled_job.save()
        runtime.sync_jobs()

        scheduled_ids = {job.id for job in runtime.scheduler.get_jobs()}
        assert f"job-{enabled_job.id}" not in scheduled_ids
    finally:
        runtime.shutdown()


def test_job_runtime_run_now_writes_log_and_stats_and_marks_success(
    tmp_path: Path,
) -> None:
    initialize_database(tmp_path / "run-now.db")
    source = create_source(
        name="Manual source",
        slug="manual-source",
        source_type="feed",
        notes="",
        spider_arguments="",
        enabled=False,
        cron_minute="*/5",
        cron_hour="*",
        cron_day_of_month="*",
        cron_day_of_week="*",
        cron_month="*",
        feed_url=FIXTURE_FEED_PATH.as_uri(),
    )
    job = Job.get(Job.source == source)
    runtime = JobRuntime(
        log_dir=tmp_path / "out" / "logs",
        worker_duration_seconds=0.35,
        worker_stats_interval_seconds=0.05,
        worker_failure_probability=0.0,
    )
    try:
        runtime.start()
        execution_id = runtime.run_job_now(job.id, reason="manual")
        assert execution_id is not None

        execution = _wait_for_terminal_execution(execution_id)
        artifacts = JobArtifacts.for_execution(
            log_dir=tmp_path / "out" / "logs",
            job_id=job.id,
            execution_id=execution_id,
        )

        assert execution.running_status == JobExecutionStatus.SUCCEEDED
        assert execution.started_at is not None
        assert execution.ended_at is not None
        assert execution.requests_count > 0
        assert execution.items_count > 0
        assert execution.bytes_count > 0
        assert artifacts.log_path.exists()
        assert artifacts.stats_path.exists()

        output_path = tmp_path / "out" / "manual-source" / "feed.rss"
        assert output_path.exists()
        output_text = output_path.read_text(encoding="utf-8")
        assert "Local Demo Feed" in output_text
        assert "Local Demo Entry" in output_text

        stats_lines = [
            json.loads(line)
            for line in artifacts.stats_path.read_text(encoding="utf-8").splitlines()
        ]
        assert len(stats_lines) >= 2
        assert stats_lines[-1]["requests_count"] == execution.requests_count
    finally:
        runtime.shutdown()
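
# The cancellation test below needs an execution that is still RUNNING when the
# cancel request arrives, so it serves the fixture feed from the deliberately
# slow local HTTP server defined at the bottom of this module
# (_SlowFeedRequestHandler sleeps for 2 seconds before responding).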


def test_job_runtime_cancel_marks_execution_canceled(tmp_path: Path) -> None:
    initialize_database(tmp_path / "cancel.db")
    with _slow_feed_server() as feed_url:
        source = create_source(
            name="Cancelable source",
            slug="cancelable-source",
            source_type="feed",
            notes="",
            spider_arguments="",
            enabled=False,
            cron_minute="*/5",
            cron_hour="*",
            cron_day_of_month="*",
            cron_day_of_week="*",
            cron_month="*",
            feed_url=feed_url,
        )
        job = Job.get(Job.source == source)
        runtime = JobRuntime(
            log_dir=tmp_path / "out" / "logs",
            worker_duration_seconds=2.0,
            worker_stats_interval_seconds=0.1,
            worker_failure_probability=0.0,
        )
        try:
            runtime.start()
            execution_id = runtime.run_job_now(job.id, reason="manual")
            assert execution_id is not None

            _wait_for_running_execution(execution_id)
            runtime.request_execution_cancel(execution_id)

            execution = _wait_for_terminal_execution(execution_id)
            artifacts = JobArtifacts.for_execution(
                log_dir=tmp_path / "out" / "logs",
                job_id=job.id,
                execution_id=execution_id,
            )

            assert execution.running_status == JobExecutionStatus.CANCELED
            assert execution.ended_at is not None
            assert execution.stop_requested_at is not None
            assert "graceful stop requested" in artifacts.log_path.read_text(
                encoding="utf-8"
            )
        finally:
            runtime.shutdown()


def test_job_runtime_start_reconciles_stale_running_execution(tmp_path: Path) -> None:
    initialize_database(tmp_path / "stale-running.db")
    source = create_source(
        name="Stale source",
        slug="stale-source",
        source_type="feed",
        notes="",
        spider_arguments="",
        enabled=False,
        cron_minute="*/5",
        cron_hour="*",
        cron_day_of_month="*",
        cron_day_of_week="*",
        cron_month="*",
        feed_url="https://example.com/stale.xml",
    )
    job = Job.get(Job.source == source)
    execution = JobExecution.create(
        job=job,
        started_at="2026-03-30 12:30:00+00:00",
        running_status=JobExecutionStatus.RUNNING,
    )
    artifacts = JobArtifacts.for_execution(
        log_dir=tmp_path / "out" / "logs",
        job_id=job.id,
        execution_id=int(execution.get_id()),
    )
    artifacts.log_path.parent.mkdir(parents=True, exist_ok=True)
    artifacts.log_path.write_text(
        "worker: process lost during app restart\n",
        encoding="utf-8",
    )
    runtime = JobRuntime(
        log_dir=tmp_path / "out" / "logs",
        worker_duration_seconds=0.5,
        worker_stats_interval_seconds=0.05,
        worker_failure_probability=0.0,
    )
    try:
        runtime.start()

        reconciled_execution = JobExecution.get_by_id(execution.get_id())
        assert reconciled_execution.running_status == JobExecutionStatus.FAILED
        assert reconciled_execution.ended_at is not None
        assert "marked failed after app restart" in artifacts.log_path.read_text(
            encoding="utf-8"
        )
    finally:
        runtime.shutdown()


def test_generate_pangea_feed_writes_pangea_rss_file(
    monkeypatch, tmp_path: Path
) -> None:
    class StubPangeaFeed:
        def __init__(self, config, feeds):
            self.config = config
            self.feed = feeds[0]

        def acquire_content(self) -> None:
            return None

        def generate_feed(self) -> None:
            return None

        def disgorge(self, slug: str):
            output_path = self.config.results.output_directory / slug / "pangea.rss"
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(
                "Pangea Fixture\n",
                encoding="utf-8",
            )
            return output_path

    monkeypatch.setattr(
        "repub.job_runner.pangea_feed_class",
        lambda: StubPangeaFeed,
    )

    output_path = generate_pangea_feed(
        name="Pangea source",
        slug="pangea-source",
        domain="example.org",
        category_name="News",
        content_type="articles",
        only_newest=True,
        max_articles=10,
        oldest_article=3,
        include_authors=True,
        exclude_media=False,
        include_content=True,
        content_format="MOBILE_3",
        out_dir=tmp_path / "out",
        log_path=tmp_path / "out" / "logs" / "pangea.log",
    )

    assert output_path == (tmp_path / "out" / "pangea-source" / "pangea.rss")
    assert output_path.exists()
    assert "Pangea Fixture" in output_path.read_text(encoding="utf-8")
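
# The web-layer tests below build the app via create_app(), which reads
# REPUBLISHER_DB_PATH from the environment; each test points that variable at
# its own temporary database so state never leaks between tests.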


def test_load_runs_view_humanizes_completed_execution_end_time(
    monkeypatch, tmp_path: Path
) -> None:
    db_path = tmp_path / "runs-view.db"
    log_dir = tmp_path / "out" / "logs"
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
    app = create_app()
    app.config["REPUB_LOG_DIR"] = log_dir
    source = create_source(
        name="Completed source",
        slug="completed-source",
        source_type="feed",
        notes="",
        spider_arguments="",
        enabled=False,
        cron_minute="*/5",
        cron_hour="*",
        cron_day_of_month="*",
        cron_day_of_week="*",
        cron_month="*",
        feed_url="https://example.com/completed.xml",
    )
    job = Job.get(Job.source == source)
    reference_time = datetime(2026, 1, 15, 12, 0, tzinfo=UTC)
    ended_at = reference_time - timedelta(hours=2)
    JobExecution.create(
        job=job,
        running_status=JobExecutionStatus.SUCCEEDED,
        ended_at=ended_at,
    )

    view = load_runs_view(log_dir=app.config["REPUB_LOG_DIR"], now=reference_time)

    completed = view["completed"][0]
    assert completed["ended_at"] == "2 hours ago"
    assert completed["ended_at_iso"] == ended_at.isoformat()


def test_render_runs_uses_database_backed_jobs_and_executions(
    monkeypatch, tmp_path: Path
) -> None:
    db_path = tmp_path / "runs-page.db"
    log_dir = tmp_path / "out" / "logs"
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
    app = create_app()
    app.config["REPUB_LOG_DIR"] = log_dir
    app.config["REPUB_JOB_WORKER_DURATION_SECONDS"] = 0.35
    app.config["REPUB_JOB_WORKER_STATS_INTERVAL_SECONDS"] = 0.05
    app.config["REPUB_JOB_WORKER_FAILURE_PROBABILITY"] = 0.0
    source = create_source(
        name="Runs page source",
        slug="runs-page-source",
        source_type="feed",
        notes="",
        spider_arguments="",
        enabled=True,
        cron_minute="*/5",
        cron_hour="*",
        cron_day_of_month="*",
        cron_day_of_week="*",
        cron_month="*",
        feed_url=FIXTURE_FEED_PATH.as_uri(),
    )
    job = Job.get(Job.source == source)
    runtime = get_job_runtime(app)
    runtime.start()
    try:
        execution_id = runtime.run_job_now(job.id, reason="manual")
        assert execution_id is not None
        execution = _wait_for_terminal_execution(execution_id)

        async def run() -> None:
            body = str(await render_runs(app))
            assert "runs-page-source" in body
            assert "Running job executions" in body
            assert "Upcoming jobs" in body
            assert "Completed job executions" in body
            assert f"/job/{job.id}/execution/{execution.get_id()}/logs" in body
            assert "Succeeded" in body
            assert "Run now" in body

        asyncio.run(run())
    finally:
        runtime.shutdown()


def test_render_execution_logs_handles_missing_execution_and_missing_log_file(
    monkeypatch, tmp_path: Path
) -> None:
    db_path = tmp_path / "log-errors.db"
    log_dir = tmp_path / "out" / "logs"
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))
    app = create_app()
    app.config["REPUB_LOG_DIR"] = log_dir
    source = create_source(
        name="Log source",
        slug="log-source",
        source_type="feed",
        notes="",
        spider_arguments="",
        enabled=False,
        cron_minute="*/5",
        cron_hour="*",
        cron_day_of_month="*",
        cron_day_of_week="*",
        cron_month="*",
        feed_url="https://example.com/log-source.xml",
    )
    job = Job.get(Job.source == source)
    execution = JobExecution.create(
        job=job,
        running_status=JobExecutionStatus.FAILED,
    )

    async def run() -> None:
        missing_execution = str(
            await render_execution_logs(app, job_id=job.id, execution_id=9999)
        )
        missing_log = str(
            await render_execution_logs(app, job_id=job.id, execution_id=execution.id)
        )

        assert "Execution log unavailable" in missing_execution
        assert "Execution does not exist." in missing_execution
        assert "Execution log unavailable" in missing_log
        assert "Log file has not been created yet." in missing_log

    asyncio.run(run())
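
# The delete test below drives the endpoint through the app's test client
# rather than calling model helpers directly, so URL routing and the 204
# response are exercised along with the cascading removal of the source, job,
# and execution history.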


def test_delete_job_action_removes_source_job_and_execution_history(
    monkeypatch, tmp_path: Path
) -> None:
    db_path = tmp_path / "delete-job.db"
    monkeypatch.setenv("REPUBLISHER_DB_PATH", str(db_path))

    async def run() -> None:
        app = create_app()
        client = app.test_client()
        source = create_source(
            name="Delete source",
            slug="delete-source",
            source_type="feed",
            notes="",
            spider_arguments="",
            enabled=True,
            cron_minute="*/30",
            cron_hour="*",
            cron_day_of_month="*",
            cron_day_of_week="*",
            cron_month="*",
            feed_url="https://example.com/delete.xml",
        )
        job = Job.get(Job.source == source)
        execution = JobExecution.create(
            job=job,
            running_status=JobExecutionStatus.SUCCEEDED,
        )

        response = await client.post(f"/actions/jobs/{job.id}/delete")

        assert response.status_code == 204
        assert Source.get_or_none(Source.slug == "delete-source") is None
        assert Job.get_or_none(id=job.id) is None
        assert JobExecution.get_or_none(id=int(execution.get_id())) is None

    asyncio.run(run())


def _wait_for_running_execution(
    execution_id: int, *, timeout_seconds: float = 2.0
) -> JobExecution:
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        execution = JobExecution.get_by_id(execution_id)
        if execution.running_status == JobExecutionStatus.RUNNING:
            return execution
        time.sleep(0.02)
    raise AssertionError(f"execution {execution_id} never entered RUNNING state")


def _wait_for_terminal_execution(
    execution_id: int, *, timeout_seconds: float = 4.0
) -> JobExecution:
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        execution = JobExecution.get_by_id(execution_id)
        if execution.running_status in {
            JobExecutionStatus.SUCCEEDED,
            JobExecutionStatus.FAILED,
            JobExecutionStatus.CANCELED,
        }:
            return execution
        time.sleep(0.02)
    raise AssertionError(f"execution {execution_id} did not finish in time")


class _SlowFeedRequestHandler(BaseHTTPRequestHandler):
    def do_GET(self) -> None:  # noqa: N802
        time.sleep(2.0)
        payload = FIXTURE_FEED_PATH.read_bytes()
        self.send_response(200)
        self.send_header("Content-Type", "application/rss+xml; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, format: str, *args: object) -> None:
        del format, args


class _ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    allow_reuse_address = True


class _slow_feed_server:
    def __enter__(self) -> str:
        self._server = _ThreadedTCPServer(("127.0.0.1", 0), _SlowFeedRequestHandler)
        self._thread = threading.Thread(
            target=self._server.serve_forever,
            kwargs={"poll_interval": 0.01},
            daemon=True,
        )
        self._thread.start()
        host = str(self._server.server_address[0])
        port = int(self._server.server_address[1])
        return f"http://{host}:{port}/slow-feed.rss"

    def __exit__(self, exc_type, exc, tb) -> None:
        del exc_type, exc, tb
        self._server.shutdown()
        self._server.server_close()
        self._thread.join(timeout=1)
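

# _slow_feed_server binds port 0 so the OS assigns a free port, keeping
# parallel test runs from colliding. Tests consume it as a context manager,
# e.g.:
#
#     with _slow_feed_server() as feed_url:
#         source = create_source(..., feed_url=feed_url)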