agent: complete plan05 closeout
This commit is contained in:
parent
33ba248c49
commit
d8c925b817
12 changed files with 1347 additions and 313 deletions
|
|
@ -6,6 +6,7 @@ import argparse
|
|||
import logging
|
||||
import signal
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from types import FrameType
|
||||
|
||||
|
|
@ -25,6 +26,29 @@ from .state_db import StateDB
|
|||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LoopHealth:
    """Record when each daemon loop last completed a successful iteration.

    Timestamps are taken from ``time.monotonic()``, and every read or write
    of the internal table happens while holding a single ``threading.Lock``.
    """

    def __init__(self) -> None:
        # Maps loop name -> monotonic timestamp of its most recent success.
        self._last_success: dict[str, float] = {}
        self._lock = threading.Lock()

    def mark_success(self, loop_name: str) -> None:
        """Stamp *loop_name* with the current monotonic time."""
        with self._lock:
            self._last_success[loop_name] = time.monotonic()

    def is_fresh(self, loop_name: str, max_age_seconds: float) -> bool:
        """Return whether *loop_name* succeeded within *max_age_seconds*.

        A loop that has never been marked is reported as not fresh.
        """
        with self._lock:
            stamp = self._last_success.get(loop_name)
            if stamp is not None:
                elapsed = time.monotonic() - stamp
                return elapsed <= max_age_seconds
            return False
|
||||
|
||||
|
||||
def _max_staleness(interval_seconds: float) -> float:
|
||||
return max(interval_seconds * 3.0, 15.0)
|
||||
|
||||
|
||||
def _scheduler_loop(
|
||||
db: StateDB,
|
||||
runtime: EC2Runtime,
|
||||
|
|
@ -32,10 +56,12 @@ def _scheduler_loop(
|
|||
clock: SystemClock,
|
||||
metrics: MetricsRegistry,
|
||||
stop_event: threading.Event,
|
||||
loop_health: LoopHealth,
|
||||
) -> None:
|
||||
while not stop_event.is_set():
|
||||
try:
|
||||
scheduling_tick(db, runtime, config, clock, metrics)
|
||||
loop_health.mark_success("scheduler")
|
||||
except Exception:
|
||||
log.exception("scheduler_tick_failed")
|
||||
stop_event.wait(config.scheduler.tick_seconds)
|
||||
|
|
@ -45,15 +71,36 @@ def _reconciler_loop(
|
|||
reconciler: Reconciler,
|
||||
config: AppConfig,
|
||||
stop_event: threading.Event,
|
||||
loop_health: LoopHealth,
|
||||
reconcile_lock: threading.Lock,
|
||||
) -> None:
|
||||
while not stop_event.is_set():
|
||||
try:
|
||||
reconciler.tick()
|
||||
with reconcile_lock:
|
||||
reconciler.tick()
|
||||
loop_health.mark_success("reconciler")
|
||||
except Exception:
|
||||
log.exception("reconciler_tick_failed")
|
||||
stop_event.wait(config.scheduler.reconcile_seconds)
|
||||
|
||||
|
||||
def _metrics_health_loop(
|
||||
metrics: MetricsRegistry,
|
||||
stop_event: threading.Event,
|
||||
loop_health: LoopHealth,
|
||||
interval_seconds: float,
|
||||
) -> None:
|
||||
while not stop_event.is_set():
|
||||
try:
|
||||
metrics.gauge("autoscaler_loop_up", {"loop": "scheduler"}, 1.0)
|
||||
metrics.gauge("autoscaler_loop_up", {"loop": "reconciler"}, 1.0)
|
||||
metrics.gauge("autoscaler_loop_up", {"loop": "metrics"}, 1.0)
|
||||
loop_health.mark_success("metrics")
|
||||
except Exception:
|
||||
log.exception("metrics_health_tick_failed")
|
||||
stop_event.wait(interval_seconds)
|
||||
|
||||
|
||||
def _parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="nix-builder-autoscaler",
|
||||
|
|
@ -92,7 +139,11 @@ def main() -> None:
|
|||
stop_event = threading.Event()
|
||||
scheduler_thread: threading.Thread | None = None
|
||||
reconciler_thread: threading.Thread | None = None
|
||||
metrics_thread: threading.Thread | None = None
|
||||
server: uvicorn.Server | None = None
|
||||
loop_health = LoopHealth()
|
||||
reconcile_lock = threading.Lock()
|
||||
metrics_interval = 5.0
|
||||
|
||||
def scheduler_running() -> bool:
    """Report whether the scheduler thread has been created and is alive."""
    worker = scheduler_thread
    if worker is None:
        return False
    return worker.is_alive()
|
||||
|
|
@ -100,6 +151,32 @@ def main() -> None:
|
|||
def reconciler_running() -> bool:
    """Report whether the reconciler thread has been created and is alive."""
    worker = reconciler_thread
    if worker is None:
        return False
    return worker.is_alive()
|
||||
|
||||
def metrics_running() -> bool:
    """Report whether the metrics thread has been created and is alive."""
    worker = metrics_thread
    if worker is None:
        return False
    return worker.is_alive()
|
||||
|
||||
def ready_check() -> bool:
    """Return True only when every daemon loop is alive and recently succeeded.

    For the scheduler, reconciler and metrics loops this requires that the
    loop's thread is running and that ``loop_health`` holds a success no
    older than ``_max_staleness`` of that loop's interval.
    """
    # Liveness and staleness limits are all evaluated up front, before any
    # freshness lookup is made.
    probes = (
        ("scheduler", scheduler_running(), _max_staleness(config.scheduler.tick_seconds)),
        (
            "reconciler",
            reconciler_running(),
            _max_staleness(config.scheduler.reconcile_seconds),
        ),
        ("metrics", metrics_running(), _max_staleness(metrics_interval)),
    )
    return all(
        alive and loop_health.is_fresh(loop_name, max_age)
        for loop_name, alive, max_age in probes
    )
|
||||
|
||||
def reconcile_now() -> dict[str, object]:
    """Run one reconciler tick on demand and report that it was triggered.

    The tick executes while holding ``reconcile_lock``; once it returns, the
    "reconciler" loop is marked successful on ``loop_health``. An exception
    from the tick propagates to the caller and no success is recorded.
    """
    with reconcile_lock:
        reconciler.tick()
    loop_health.mark_success("reconciler")
    outcome: dict[str, object] = {"triggered": True}
    return outcome
|
||||
|
||||
app = create_app(
|
||||
db,
|
||||
config,
|
||||
|
|
@ -109,23 +186,36 @@ def main() -> None:
|
|||
haproxy=haproxy,
|
||||
scheduler_running=scheduler_running,
|
||||
reconciler_running=reconciler_running,
|
||||
ready_check=ready_check,
|
||||
reconcile_now=reconcile_now,
|
||||
)
|
||||
|
||||
loop_health.mark_success("scheduler")
|
||||
loop_health.mark_success("reconciler")
|
||||
loop_health.mark_success("metrics")
|
||||
|
||||
scheduler_thread = threading.Thread(
|
||||
target=_scheduler_loop,
|
||||
name="autoscaler-scheduler",
|
||||
args=(db, runtime, config, clock, metrics, stop_event),
|
||||
args=(db, runtime, config, clock, metrics, stop_event, loop_health),
|
||||
daemon=True,
|
||||
)
|
||||
reconciler_thread = threading.Thread(
|
||||
target=_reconciler_loop,
|
||||
name="autoscaler-reconciler",
|
||||
args=(reconciler, config, stop_event),
|
||||
args=(reconciler, config, stop_event, loop_health, reconcile_lock),
|
||||
daemon=True,
|
||||
)
|
||||
metrics_thread = threading.Thread(
|
||||
target=_metrics_health_loop,
|
||||
name="autoscaler-metrics-health",
|
||||
args=(metrics, stop_event, loop_health, metrics_interval),
|
||||
daemon=True,
|
||||
)
|
||||
|
||||
scheduler_thread.start()
|
||||
reconciler_thread.start()
|
||||
metrics_thread.start()
|
||||
|
||||
socket_path = Path(config.server.socket_path)
|
||||
socket_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
|
@ -156,6 +246,8 @@ def main() -> None:
|
|||
scheduler_thread.join(timeout=10)
|
||||
if reconciler_thread is not None:
|
||||
reconciler_thread.join(timeout=10)
|
||||
if metrics_thread is not None:
|
||||
metrics_thread.join(timeout=10)
|
||||
db.close()
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue