2026-02-27 12:46:32 +01:00
|
|
|
"""Daemon entry point: python -m nix_builder_autoscaler."""
|
2026-02-27 11:59:16 +01:00
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import argparse
|
2026-02-27 12:46:32 +01:00
|
|
|
import logging
|
|
|
|
|
import signal
|
|
|
|
|
import threading
|
2026-02-27 13:48:52 +01:00
|
|
|
import time
|
2026-02-27 12:46:32 +01:00
|
|
|
from pathlib import Path
|
|
|
|
|
from types import FrameType
|
2026-02-27 11:59:16 +01:00
|
|
|
|
2026-02-27 12:46:32 +01:00
|
|
|
import uvicorn
|
2026-02-27 11:59:16 +01:00
|
|
|
|
2026-02-27 12:46:32 +01:00
|
|
|
from .api import create_app
|
|
|
|
|
from .config import AppConfig, load_config
|
|
|
|
|
from .logging import setup_logging
|
|
|
|
|
from .metrics import MetricsRegistry
|
|
|
|
|
from .providers.clock import SystemClock
|
|
|
|
|
from .providers.haproxy import HAProxyRuntime
|
|
|
|
|
from .reconciler import Reconciler
|
|
|
|
|
from .runtime.ec2 import EC2Runtime
|
|
|
|
|
from .scheduler import scheduling_tick
|
|
|
|
|
from .state_db import StateDB
|
|
|
|
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
2026-02-27 13:48:52 +01:00
|
|
|
class LoopHealth:
    """Thread-safe last-success timestamps for daemon loops.

    Each loop is identified by a string name. Timestamps are taken from
    ``time.monotonic()`` and every access to the underlying mapping is
    guarded by a single ``threading.Lock``.
    """

    def __init__(self) -> None:
        """Create an empty registry; no loop counts as fresh until marked."""
        self._lock = threading.Lock()
        # loop name -> time.monotonic() reading of its latest success
        self._last_success: dict[str, float] = {}

    def mark_success(self, loop_name: str) -> None:
        """Record the current monotonic time as ``loop_name``'s last success."""
        with self._lock:
            self._last_success[loop_name] = time.monotonic()

    def is_fresh(self, loop_name: str, max_age_seconds: float) -> bool:
        """Return whether ``loop_name`` succeeded within ``max_age_seconds``.

        A loop that has never been marked is reported as not fresh.
        """
        with self._lock:
            recorded = self._last_success.get(loop_name)
            if recorded is not None:
                age = time.monotonic() - recorded
                return age <= max_age_seconds
            return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _max_staleness(interval_seconds: float) -> float:
|
|
|
|
|
return max(interval_seconds * 3.0, 15.0)
|
|
|
|
|
|
|
|
|
|
|
2026-02-27 12:46:32 +01:00
|
|
|
def _scheduler_loop(
    db: StateDB,
    runtime: EC2Runtime,
    config: AppConfig,
    clock: SystemClock,
    metrics: MetricsRegistry,
    stop_event: threading.Event,
    loop_health: LoopHealth,
) -> None:
    """Run ``scheduling_tick`` repeatedly until ``stop_event`` is set.

    After each tick that returns without raising, the "scheduler" entry in
    ``loop_health`` is refreshed. Any ``Exception`` raised inside an
    iteration is logged and the loop keeps going. Between iterations the
    loop waits on ``stop_event`` for ``config.scheduler.tick_seconds``.
    """
    while True:
        if stop_event.is_set():
            return
        try:
            scheduling_tick(db, runtime, config, clock, metrics)
            loop_health.mark_success("scheduler")
        except Exception:
            log.exception("scheduler_tick_failed")
        # Waiting on the event (not sleeping) lets a stop request end the
        # pause early.
        stop_event.wait(config.scheduler.tick_seconds)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _reconciler_loop(
|
|
|
|
|
reconciler: Reconciler,
|
|
|
|
|
config: AppConfig,
|
|
|
|
|
stop_event: threading.Event,
|
2026-02-27 13:48:52 +01:00
|
|
|
loop_health: LoopHealth,
|
|
|
|
|
reconcile_lock: threading.Lock,
|
2026-02-27 12:46:32 +01:00
|
|
|
) -> None:
|
|
|
|
|
while not stop_event.is_set():
|
|
|
|
|
try:
|
2026-02-27 13:48:52 +01:00
|
|
|
with reconcile_lock:
|
|
|
|
|
reconciler.tick()
|
|
|
|
|
loop_health.mark_success("reconciler")
|
2026-02-27 12:46:32 +01:00
|
|
|
except Exception:
|
|
|
|
|
log.exception("reconciler_tick_failed")
|
|
|
|
|
stop_event.wait(config.scheduler.reconcile_seconds)
|
|
|
|
|
|
|
|
|
|
|
2026-02-27 13:48:52 +01:00
|
|
|
def _metrics_health_loop(
|
|
|
|
|
metrics: MetricsRegistry,
|
|
|
|
|
stop_event: threading.Event,
|
|
|
|
|
loop_health: LoopHealth,
|
|
|
|
|
interval_seconds: float,
|
|
|
|
|
) -> None:
|
|
|
|
|
while not stop_event.is_set():
|
|
|
|
|
try:
|
|
|
|
|
metrics.gauge("autoscaler_loop_up", {"loop": "scheduler"}, 1.0)
|
|
|
|
|
metrics.gauge("autoscaler_loop_up", {"loop": "reconciler"}, 1.0)
|
|
|
|
|
metrics.gauge("autoscaler_loop_up", {"loop": "metrics"}, 1.0)
|
|
|
|
|
loop_health.mark_success("metrics")
|
|
|
|
|
except Exception:
|
|
|
|
|
log.exception("metrics_health_tick_failed")
|
|
|
|
|
stop_event.wait(interval_seconds)
|
|
|
|
|
|
|
|
|
|
|
2026-02-27 12:46:32 +01:00
|
|
|
def _parse_args() -> argparse.Namespace:
|
2026-02-27 11:59:16 +01:00
|
|
|
parser = argparse.ArgumentParser(
|
|
|
|
|
prog="nix-builder-autoscaler",
|
|
|
|
|
description="Nix builder autoscaler daemon",
|
|
|
|
|
)
|
|
|
|
|
parser.add_argument("--config", required=True, help="Path to TOML config file")
|
2026-02-27 12:46:32 +01:00
|
|
|
return parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """Parse config, initialize components, and run the daemon.

    Sequence:

    1. Load the config named by ``--config`` and configure logging.
    2. Open the state DB, initialise its schema and slots, and build the
       EC2 runtime, HAProxy runtime, metrics registry and reconciler.
    3. Run one reconciler tick synchronously, before any thread exists.
    4. Build the API app, then start the scheduler, reconciler and
       metrics-health threads (all daemon threads).
    5. Serve the app with uvicorn on ``config.server.socket_path`` until the
       server exits; SIGTERM/SIGINT request shutdown.
    6. On the way out (normal exit or an error during steps 5's setup), set
       the stop event, join each thread for up to 10 seconds, and close the
       DB.
    """
    args = _parse_args()
    config = load_config(Path(args.config))
    setup_logging(config.server.log_level)

    clock = SystemClock()
    db = StateDB(config.server.db_path, clock=clock)
    db.init_schema()
    db.init_slots(
        config.haproxy.slot_prefix,
        config.haproxy.slot_count,
        config.capacity.default_system,
        config.haproxy.backend,
    )

    runtime = EC2Runtime(config.aws)
    haproxy = HAProxyRuntime(
        config.haproxy.runtime_socket,
        config.haproxy.backend,
        config.haproxy.slot_prefix,
    )
    metrics = MetricsRegistry()
    reconciler = Reconciler(db, runtime, haproxy, config, clock, metrics)
    # One synchronous pass before the background loops take over.
    reconciler.tick()

    stop_event = threading.Event()
    # Declared up front so the closures below can be defined (and handed to
    # the app) before the thread/server objects are actually created.
    scheduler_thread: threading.Thread | None = None
    reconciler_thread: threading.Thread | None = None
    metrics_thread: threading.Thread | None = None
    server: uvicorn.Server | None = None
    loop_health = LoopHealth()
    # Serialises the periodic reconciler tick with on-demand ticks
    # triggered through reconcile_now().
    reconcile_lock = threading.Lock()
    metrics_interval = 5.0

    def scheduler_running() -> bool:
        return scheduler_thread is not None and scheduler_thread.is_alive()

    def reconciler_running() -> bool:
        return reconciler_thread is not None and reconciler_thread.is_alive()

    def metrics_running() -> bool:
        return metrics_thread is not None and metrics_thread.is_alive()

    def ready_check() -> bool:
        # Ready only when every loop's thread is alive AND its last success
        # is recent enough for that loop's own interval.
        checks = [
            ("scheduler", scheduler_running(), _max_staleness(config.scheduler.tick_seconds)),
            (
                "reconciler",
                reconciler_running(),
                _max_staleness(config.scheduler.reconcile_seconds),
            ),
            ("metrics", metrics_running(), _max_staleness(metrics_interval)),
        ]
        return all(
            alive and loop_health.is_fresh(loop_name, max_age)
            for loop_name, alive, max_age in checks
        )

    def reconcile_now() -> dict[str, object]:
        with reconcile_lock:
            reconciler.tick()
        loop_health.mark_success("reconciler")
        return {"triggered": True}

    app = create_app(
        db,
        config,
        clock,
        metrics,
        runtime=runtime,
        haproxy=haproxy,
        scheduler_running=scheduler_running,
        reconciler_running=reconciler_running,
        ready_check=ready_check,
        reconcile_now=reconcile_now,
    )

    # Seed the timestamps: is_fresh() reports False for a loop that has
    # never been marked, so without this ready_check() could not pass until
    # every loop had completed its first iteration.
    loop_health.mark_success("scheduler")
    loop_health.mark_success("reconciler")
    loop_health.mark_success("metrics")

    scheduler_thread = threading.Thread(
        target=_scheduler_loop,
        name="autoscaler-scheduler",
        args=(db, runtime, config, clock, metrics, stop_event, loop_health),
        daemon=True,
    )
    reconciler_thread = threading.Thread(
        target=_reconciler_loop,
        name="autoscaler-reconciler",
        args=(reconciler, config, stop_event, loop_health, reconcile_lock),
        daemon=True,
    )
    metrics_thread = threading.Thread(
        target=_metrics_health_loop,
        name="autoscaler-metrics-health",
        args=(metrics, stop_event, loop_health, metrics_interval),
        daemon=True,
    )

    def _handle_signal(signum: int, _: FrameType | None) -> None:
        log.info("shutdown_signal", extra={"signal": signum})
        stop_event.set()
        if server is not None:
            server.should_exit = True

    scheduler_thread.start()
    reconciler_thread.start()
    metrics_thread.start()

    # Everything after the threads are started sits inside the try so that
    # a failure while preparing the socket or building the server still
    # stops the loops and closes the DB.
    try:
        socket_path = Path(config.server.socket_path)
        socket_path.parent.mkdir(parents=True, exist_ok=True)
        # Remove a leftover socket file in one step; a separate
        # exists()-then-unlink() can fail if the file disappears in between.
        socket_path.unlink(missing_ok=True)

        uvicorn_config = uvicorn.Config(
            app=app,
            uds=config.server.socket_path,
            log_level=config.server.log_level.lower(),
        )
        server = uvicorn.Server(uvicorn_config)

        signal.signal(signal.SIGTERM, _handle_signal)
        signal.signal(signal.SIGINT, _handle_signal)

        server.run()
    finally:
        stop_event.set()
        if scheduler_thread is not None:
            scheduler_thread.join(timeout=10)
        if reconciler_thread is not None:
            reconciler_thread.join(timeout=10)
        if metrics_thread is not None:
            metrics_thread.join(timeout=10)
        db.close()
|
2026-02-27 11:59:16 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
main()
|