"""Daemon entry point: python -m nix_builder_autoscaler."""

from __future__ import annotations

import argparse
import logging
import signal
import threading
import time
from pathlib import Path
from types import FrameType

import uvicorn

from .api import create_app
from .config import AppConfig, load_config
from .logging import setup_logging
from .metrics import MetricsRegistry
from .providers.clock import SystemClock
from .providers.haproxy import HAProxyRuntime
from .reconciler import Reconciler
from .runtime.ec2 import EC2Runtime
from .scheduler import scheduling_tick
from .state_db import StateDB

log = logging.getLogger(__name__)

# Upper bound, in seconds, on how long shutdown waits for each loop thread.
_THREAD_JOIN_TIMEOUT_SECONDS = 10.0


class LoopHealth:
    """Thread-safe last-success timestamps for daemon loops.

    Timestamps come from ``time.monotonic()``, so freshness checks are not
    affected by wall-clock adjustments.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        # Loop name -> monotonic timestamp of its most recent success.
        self._last_success: dict[str, float] = {}

    def mark_success(self, loop_name: str) -> None:
        """Record that ``loop_name`` completed successfully just now."""
        with self._lock:
            self._last_success[loop_name] = time.monotonic()

    def is_fresh(self, loop_name: str, max_age_seconds: float) -> bool:
        """Return whether ``loop_name`` succeeded within ``max_age_seconds``.

        A loop that has never been marked successful is not fresh.
        """
        with self._lock:
            last = self._last_success.get(loop_name)
            if last is None:
                return False
            return (time.monotonic() - last) <= max_age_seconds


def _max_staleness(interval_seconds: float) -> float:
    """Return the tolerated age of a loop's last success.

    This is three loop intervals, but never less than 15 seconds.
    """
    return max(interval_seconds * 3.0, 15.0)


def _scheduler_loop(
    db: StateDB,
    runtime: EC2Runtime,
    config: AppConfig,
    clock: SystemClock,
    metrics: MetricsRegistry,
    stop_event: threading.Event,
    loop_health: LoopHealth,
) -> None:
    """Run ``scheduling_tick`` repeatedly until ``stop_event`` is set.

    A failing tick is logged and does not end the loop; only a tick that
    returns normally is recorded as a success in ``loop_health``.
    """
    while not stop_event.is_set():
        try:
            scheduling_tick(db, runtime, config, clock, metrics)
            loop_health.mark_success("scheduler")
        except Exception:
            log.exception("scheduler_tick_failed")
        # Waiting on the event (rather than sleeping) lets shutdown interrupt
        # the pause between ticks.
        stop_event.wait(config.scheduler.tick_seconds)


def _reconciler_loop(
    reconciler: Reconciler,
    config: AppConfig,
    stop_event: threading.Event,
    loop_health: LoopHealth,
    reconcile_lock: threading.Lock,
) -> None:
    """Run ``reconciler.tick()`` repeatedly until ``stop_event`` is set.

    Each tick runs while holding ``reconcile_lock``. A failing tick is logged
    and does not end the loop.
    """
    while not stop_event.is_set():
        try:
            with reconcile_lock:
                reconciler.tick()
            loop_health.mark_success("reconciler")
        except Exception:
            log.exception("reconciler_tick_failed")
        stop_event.wait(config.scheduler.reconcile_seconds)


def _metrics_health_loop(
    metrics: MetricsRegistry,
    stop_event: threading.Event,
    loop_health: LoopHealth,
    interval_seconds: float,
) -> None:
    """Publish the ``autoscaler_loop_up`` gauges until ``stop_event`` is set.

    Every ``interval_seconds`` the gauge is set for the scheduler, reconciler,
    and metrics loops, and the metrics loop is marked successful.
    """
    while not stop_event.is_set():
        try:
            # NOTE(review): all three gauges are set to 1.0 unconditionally;
            # they do not consult loop_health or thread liveness, so they
            # only show that this metrics loop itself is running.
            metrics.gauge("autoscaler_loop_up", {"loop": "scheduler"}, 1.0)
            metrics.gauge("autoscaler_loop_up", {"loop": "reconciler"}, 1.0)
            metrics.gauge("autoscaler_loop_up", {"loop": "metrics"}, 1.0)
            loop_health.mark_success("metrics")
        except Exception:
            log.exception("metrics_health_tick_failed")
        stop_event.wait(interval_seconds)


def _join_loop_threads(*threads: threading.Thread | None) -> None:
    """Wait briefly for each started loop thread to finish.

    Threads that are ``None`` or not alive are skipped: joining a thread that
    was never started raises ``RuntimeError``, and joining a finished thread
    is a no-op anyway.
    """
    for thread in threads:
        if thread is not None and thread.is_alive():
            thread.join(timeout=_THREAD_JOIN_TIMEOUT_SECONDS)


def _parse_args() -> argparse.Namespace:
    """Parse command-line arguments; ``--config`` is required."""
    parser = argparse.ArgumentParser(
        prog="nix-builder-autoscaler",
        description="Nix builder autoscaler daemon",
    )
    parser.add_argument("--config", required=True, help="Path to TOML config file")
    return parser.parse_args()


def main() -> None:
    """Parse config, initialize components, and run the daemon."""
    args = _parse_args()
    config = load_config(Path(args.config))
    setup_logging(config.server.log_level)

    clock = SystemClock()
    db = StateDB(config.server.db_path, clock=clock)
    db.init_schema()
    db.init_slots(
        config.haproxy.slot_prefix,
        config.haproxy.slot_count,
        config.capacity.default_system,
        config.haproxy.backend,
    )

    runtime = EC2Runtime(config.aws)
    haproxy = HAProxyRuntime(
        config.haproxy.runtime_socket,
        config.haproxy.backend,
        config.haproxy.slot_prefix,
    )
    metrics = MetricsRegistry()
    reconciler = Reconciler(db, runtime, haproxy, config, clock, metrics)
    # Run one reconcile pass synchronously before any background loop starts.
    reconciler.tick()

    stop_event = threading.Event()
    # Declared up front so the closures below can refer to them; the closures
    # read these names at call time, after the threads have been assigned.
    scheduler_thread: threading.Thread | None = None
    reconciler_thread: threading.Thread | None = None
    metrics_thread: threading.Thread | None = None
    server: uvicorn.Server | None = None
    loop_health = LoopHealth()
    reconcile_lock = threading.Lock()
    metrics_interval = 5.0

    def scheduler_running() -> bool:
        return scheduler_thread is not None and scheduler_thread.is_alive()

    def reconciler_running() -> bool:
        return reconciler_thread is not None and reconciler_thread.is_alive()

    def metrics_running() -> bool:
        return metrics_thread is not None and metrics_thread.is_alive()

    def ready_check() -> bool:
        """Return True only if every loop thread is alive and recently succeeded."""
        checks = [
            ("scheduler", scheduler_running(), _max_staleness(config.scheduler.tick_seconds)),
            (
                "reconciler",
                reconciler_running(),
                _max_staleness(config.scheduler.reconcile_seconds),
            ),
            ("metrics", metrics_running(), _max_staleness(metrics_interval)),
        ]
        return all(
            alive and loop_health.is_fresh(loop_name, max_age)
            for loop_name, alive, max_age in checks
        )

    def reconcile_now() -> dict[str, object]:
        """Run one reconcile pass immediately, serialized with the reconciler loop."""
        with reconcile_lock:
            reconciler.tick()
        loop_health.mark_success("reconciler")
        return {"triggered": True}

    app = create_app(
        db,
        config,
        clock,
        metrics,
        runtime=runtime,
        haproxy=haproxy,
        scheduler_running=scheduler_running,
        reconciler_running=reconciler_running,
        ready_check=ready_check,
        reconcile_now=reconcile_now,
    )

    # Seed the timestamps so ready_check() can pass before each loop has
    # completed its first tick.
    loop_health.mark_success("scheduler")
    loop_health.mark_success("reconciler")
    loop_health.mark_success("metrics")

    scheduler_thread = threading.Thread(
        target=_scheduler_loop,
        name="autoscaler-scheduler",
        args=(db, runtime, config, clock, metrics, stop_event, loop_health),
        daemon=True,
    )
    reconciler_thread = threading.Thread(
        target=_reconciler_loop,
        name="autoscaler-reconciler",
        args=(reconciler, config, stop_event, loop_health, reconcile_lock),
        daemon=True,
    )
    metrics_thread = threading.Thread(
        target=_metrics_health_loop,
        name="autoscaler-metrics-health",
        args=(metrics, stop_event, loop_health, metrics_interval),
        daemon=True,
    )

    # Everything from starting the loops onward sits inside the try so that a
    # failure while preparing the socket or the server still stops the loops
    # and closes the database.
    try:
        scheduler_thread.start()
        reconciler_thread.start()
        metrics_thread.start()

        socket_path = Path(config.server.socket_path)
        socket_path.parent.mkdir(parents=True, exist_ok=True)
        # Remove a leftover socket file in one step; a separate exists() check
        # is racy and would skip a dangling symlink.
        socket_path.unlink(missing_ok=True)

        uvicorn_config = uvicorn.Config(
            app=app,
            uds=config.server.socket_path,
            log_level=config.server.log_level.lower(),
        )
        server = uvicorn.Server(uvicorn_config)

        def _handle_signal(signum: int, _: FrameType | None) -> None:
            log.info("shutdown_signal", extra={"signal": signum})
            stop_event.set()
            if server is not None:
                server.should_exit = True

        signal.signal(signal.SIGTERM, _handle_signal)
        signal.signal(signal.SIGINT, _handle_signal)

        server.run()
    finally:
        stop_event.set()
        _join_loop_threads(scheduler_thread, reconciler_thread, metrics_thread)
        db.close()


if __name__ == "__main__":
    main()