add runtime adapters, scheduler, reconciler, and their unit tests
This commit is contained in:
parent
d1976a5fd8
commit
b63d69c81d
10 changed files with 1471 additions and 28 deletions
|
|
@ -1 +1,258 @@
|
|||
"""Reconciler — stub for Plan 03."""
|
||||
"""Reconciler — advances slots through the state machine.
|
||||
|
||||
Each tick queries EC2 and HAProxy, then processes each slot according to
|
||||
its current state: launching→booting→binding→ready, with draining and
|
||||
terminating paths for teardown.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from .models import SlotState
|
||||
from .providers.haproxy import HAProxyError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .config import AppConfig
|
||||
from .metrics import MetricsRegistry
|
||||
from .providers.clock import Clock
|
||||
from .providers.haproxy import HAProxyRuntime
|
||||
from .runtime.base import RuntimeAdapter
|
||||
from .state_db import StateDB
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Reconciler:
    """Advances slots through the state machine by polling EC2 and HAProxy.

    Each tick takes one snapshot of the managed EC2 instances and the
    HAProxy backend health, then advances every slot according to its
    current state:

        launching -> booting -> binding -> ready        (bring-up)
        draining  -> terminating -> empty               (teardown)

    Binding health-check counters persist between ticks, so a slot must be
    observed UP for several consecutive ticks before it is promoted to
    ready.
    """

    def __init__(
        self,
        db: StateDB,
        runtime: RuntimeAdapter,
        haproxy: HAProxyRuntime,
        config: AppConfig,
        clock: Clock,
        metrics: MetricsRegistry,
    ) -> None:
        self._db = db
        self._runtime = runtime
        self._haproxy = haproxy
        self._config = config
        self._clock = clock
        self._metrics = metrics
        # slot_id -> consecutive UP observations while the slot is BINDING.
        self._binding_up_counts: dict[str, int] = {}

    def tick(self) -> None:
        """Execute one reconciliation tick."""
        t0 = time.monotonic()

        # 1. Query EC2. Track whether the listing succeeded: an empty
        #    snapshot caused by an API failure must not be mistaken for
        #    "all instances are gone" — that would move every READY slot to
        #    ERROR and orphan still-running instances.
        ec2_ok = True
        try:
            managed = self._runtime.list_managed_instances()
        except Exception:
            log.exception("ec2_list_failed")
            managed = []
            ec2_ok = False
        ec2_by_slot: dict[str, dict] = {}
        for inst in managed:
            sid = inst.get("slot_id")
            if sid:
                ec2_by_slot[sid] = inst

        # 2. Query HAProxy (best effort; an empty health map only delays
        #    BINDING promotion until the next successful read).
        try:
            haproxy_health = self._haproxy.read_slot_health()
        except HAProxyError:
            log.warning("haproxy_stat_failed", exc_info=True)
            haproxy_health = {}

        # 3. Process each slot. One slot's failure (e.g. a describe call
        #    raising) must not prevent the remaining slots from being
        #    reconciled this tick.
        all_slots = self._db.list_slots()
        for slot in all_slots:
            try:
                self._process_slot(slot, ec2_by_slot, haproxy_health, ec2_ok)
            except Exception:
                log.exception(
                    "slot_reconcile_failed", extra={"slot_id": slot["slot_id"]}
                )

        # 4. Drop binding counters for slots that have left BINDING.
        binding_ids = {
            s["slot_id"] for s in all_slots if s["state"] == SlotState.BINDING.value
        }
        stale = [k for k in self._binding_up_counts if k not in binding_ids]
        for k in stale:
            del self._binding_up_counts[k]

        # 5. Emit metrics.
        tick_duration = time.monotonic() - t0
        self._update_metrics(tick_duration)

    def _process_slot(
        self,
        slot: dict,
        ec2_by_slot: dict[str, dict],
        haproxy_health: dict,
        ec2_ok: bool,
    ) -> None:
        """Dispatch one slot to the handler for its current state."""
        state = slot["state"]
        if state == SlotState.LAUNCHING.value:
            self._handle_launching(slot)
        elif state == SlotState.BOOTING.value:
            self._handle_booting(slot)
        elif state == SlotState.BINDING.value:
            self._handle_binding(slot, haproxy_health)
        elif state == SlotState.READY.value:
            # Only judge READY slots against a snapshot we trust; when the
            # EC2 listing failed, defer the liveness check to the next tick.
            if ec2_ok:
                self._handle_ready(slot, ec2_by_slot)
        elif state == SlotState.DRAINING.value:
            self._handle_draining(slot)
        elif state == SlotState.TERMINATING.value:
            self._handle_terminating(slot, ec2_by_slot)

    def _handle_launching(self, slot: dict) -> None:
        """Promote a LAUNCHING slot to BOOTING once EC2 reports it running.

        A slot with no instance id (inconsistent record), or whose instance
        died before reaching "running", is moved to ERROR.
        """
        instance_id = slot["instance_id"]
        if not instance_id:
            self._db.update_slot_state(slot["slot_id"], SlotState.ERROR)
            return

        info = self._runtime.describe_instance(instance_id)
        ec2_state = info["state"]

        if ec2_state == "running":
            self._db.update_slot_state(slot["slot_id"], SlotState.BOOTING)
            log.info("slot_booting", extra={"slot_id": slot["slot_id"]})
        elif ec2_state in ("terminated", "shutting-down"):
            self._db.update_slot_state(slot["slot_id"], SlotState.ERROR)
            log.warning(
                "slot_launch_terminated",
                extra={"slot_id": slot["slot_id"], "ec2_state": ec2_state},
            )

    def _handle_booting(self, slot: dict) -> None:
        """Promote a BOOTING slot to BINDING once it has a Tailscale IP.

        On promotion the slot is wired into HAProxy (address set, server
        enabled). HAProxy failures here are logged but non-fatal —
        ``_handle_binding`` retries the setup on subsequent ticks.
        """
        instance_id = slot["instance_id"]
        if not instance_id:
            # Inconsistent record: BOOTING implies an instance exists.
            self._db.update_slot_state(slot["slot_id"], SlotState.ERROR)
            return

        info = self._runtime.describe_instance(instance_id)
        ec2_state = info["state"]

        if ec2_state in ("terminated", "shutting-down"):
            self._db.update_slot_state(slot["slot_id"], SlotState.ERROR)
            log.warning(
                "slot_boot_terminated",
                extra={"slot_id": slot["slot_id"], "ec2_state": ec2_state},
            )
            return

        tailscale_ip = info.get("tailscale_ip")
        if tailscale_ip is not None:
            self._db.update_slot_state(
                slot["slot_id"], SlotState.BINDING, instance_ip=tailscale_ip
            )
            try:
                self._haproxy.set_slot_addr(slot["slot_id"], tailscale_ip)
                self._haproxy.enable_slot(slot["slot_id"])
            except HAProxyError:
                log.warning(
                    "haproxy_binding_setup_failed",
                    extra={"slot_id": slot["slot_id"]},
                    exc_info=True,
                )

    def _handle_binding(self, slot: dict, haproxy_health: dict) -> None:
        """Promote a BINDING slot to READY after consecutive UP health checks.

        Requires ``check_ready_up_count`` consecutive ticks with HAProxy
        reporting the slot UP; any non-UP observation resets the counter and
        retries the HAProxy wiring. Also moves the slot to ERROR if its
        instance has terminated underneath it.
        """
        slot_id = slot["slot_id"]
        health = haproxy_health.get(slot_id)

        if health is not None and health.status == "UP":
            count = self._binding_up_counts.get(slot_id, 0) + 1
            self._binding_up_counts[slot_id] = count
            if count >= self._config.haproxy.check_ready_up_count:
                self._db.update_slot_state(slot_id, SlotState.READY)
                self._binding_up_counts.pop(slot_id, None)
                log.info("slot_ready", extra={"slot_id": slot_id})
        else:
            # Not UP (or not visible yet): reset the streak and re-apply the
            # HAProxy wiring in case the earlier setup attempt failed.
            self._binding_up_counts[slot_id] = 0
            ip = slot.get("instance_ip")
            if ip:
                with contextlib.suppress(HAProxyError):
                    self._haproxy.set_slot_addr(slot_id, ip)
                    self._haproxy.enable_slot(slot_id)

            # Check whether the instance is still alive at all.
            instance_id = slot.get("instance_id")
            if instance_id:
                info = self._runtime.describe_instance(instance_id)
                if info["state"] in ("terminated", "shutting-down"):
                    self._db.update_slot_state(slot_id, SlotState.ERROR)
                    self._binding_up_counts.pop(slot_id, None)
                    log.warning(
                        "slot_binding_terminated",
                        extra={"slot_id": slot_id},
                    )

    def _handle_ready(self, slot: dict, ec2_by_slot: dict[str, dict]) -> None:
        """Verify a READY slot's instance is still alive.

        Only called with a trusted EC2 snapshot (see ``_process_slot``):
        absence from the snapshot, or a "terminated" state, means the
        instance is gone and the slot goes to ERROR; "shutting-down" marks
        a pending interruption instead.
        """
        slot_id = slot["slot_id"]
        ec2_info = ec2_by_slot.get(slot_id)

        if ec2_info is None or ec2_info["state"] == "terminated":
            self._db.update_slot_state(
                slot_id, SlotState.ERROR, instance_id=None, instance_ip=None
            )
            log.warning("slot_ready_instance_gone", extra={"slot_id": slot_id})
        elif ec2_info["state"] == "shutting-down":
            self._db.update_slot_fields(slot_id, interruption_pending=1)
            log.info("slot_interruption_detected", extra={"slot_id": slot_id})

    def _handle_draining(self, slot: dict) -> None:
        """Disable HAProxy and terminate when drain conditions are met.

        Termination happens once the slot has no active leases or the
        configured drain timeout has elapsed since it entered DRAINING.
        """
        slot_id = slot["slot_id"]

        # Disable HAProxy (idempotent; safe to repeat every tick).
        with contextlib.suppress(HAProxyError):
            self._haproxy.disable_slot(slot_id)

        now = self._clock.now()
        last_change = datetime.fromisoformat(slot["last_state_change"])
        drain_duration = (now - last_change).total_seconds()

        drain_timeout = self._config.capacity.drain_timeout_seconds
        if slot["lease_count"] == 0 or drain_duration >= drain_timeout:
            instance_id = slot.get("instance_id")
            if instance_id:
                try:
                    self._runtime.terminate_instance(instance_id)
                    self._metrics.counter("autoscaler_ec2_terminate_total", {}, 1.0)
                except Exception:
                    # Best-effort: _handle_terminating keeps watching until
                    # EC2 actually reports the instance terminated.
                    log.warning(
                        "terminate_failed",
                        extra={"slot_id": slot_id, "instance_id": instance_id},
                        exc_info=True,
                    )
            self._db.update_slot_state(slot_id, SlotState.TERMINATING)
            log.info(
                "slot_terminating",
                extra={"slot_id": slot_id, "drain_duration": drain_duration},
            )

    def _handle_terminating(self, slot: dict, ec2_by_slot: dict[str, dict]) -> None:
        """Wait for EC2 to confirm termination, then reset the slot to EMPTY.

        NOTE(review): ``ec2_by_slot`` is accepted for signature symmetry with
        the other handlers but the instance is re-described directly so the
        terminal state is confirmed against a fresh read.
        """
        slot_id = slot["slot_id"]
        instance_id = slot.get("instance_id")

        if not instance_id:
            # Nothing to wait for — reset immediately.
            self._db.update_slot_state(
                slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
            )
            log.info("slot_emptied", extra={"slot_id": slot_id})
            return

        info = self._runtime.describe_instance(instance_id)
        if info["state"] == "terminated":
            self._db.update_slot_state(
                slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
            )
            log.info("slot_emptied", extra={"slot_id": slot_id})

    def _update_metrics(self, tick_duration: float) -> None:
        """Emit per-state slot gauges and the tick-duration histogram."""
        summary = self._db.get_state_summary()
        for state, count in summary["slots"].items():
            if state == "total":
                # "total" is an aggregate, not a state label.
                continue
            self._metrics.gauge("autoscaler_slots", {"state": state}, float(count))
        self._metrics.histogram_observe("autoscaler_reconciler_tick_seconds", {}, tick_duration)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue