agent: complete plan05 closeout

2026-02-27 13:48:52 +01:00 · 2026-02-27 13:48:52 +01:00 · 2f0fffa905
commit 2f0fffa905
parent 33ba248c49
12 changed files with 1347 additions and 313 deletions
--- a/agent/nix_builder_autoscaler/tests/integration/test_end_to_end_fake.py
+++ b/agent/nix_builder_autoscaler/tests/integration/test_end_to_end_fake.py
@ -1 +1,407 @@
-"""End-to-end integration tests with FakeRuntime — Plan 05."""
+"""End-to-end integration tests with FakeRuntime and a fake HAProxy socket."""
+
+from __future__ import annotations
+
+import socket
+import threading
+import time
+from pathlib import Path
+
+from fastapi.testclient import TestClient
+
+from nix_builder_autoscaler.api import create_app
+from nix_builder_autoscaler.config import (
+    AppConfig,
+    AwsConfig,
+    CapacityConfig,
+    HaproxyConfig,
+    SchedulerConfig,
+)
+from nix_builder_autoscaler.metrics import MetricsRegistry
+from nix_builder_autoscaler.models import SlotState
+from nix_builder_autoscaler.providers.clock import FakeClock
+from nix_builder_autoscaler.providers.haproxy import HAProxyRuntime
+from nix_builder_autoscaler.reconciler import Reconciler
+from nix_builder_autoscaler.runtime.fake import FakeRuntime
+from nix_builder_autoscaler.scheduler import scheduling_tick
+from nix_builder_autoscaler.state_db import StateDB
+
+
+class FakeHAProxySocketServer:
+    """Tiny fake HAProxy runtime socket server for integration tests."""
+
+    def __init__(self, socket_path: Path, backend: str, slot_ids: list[str]) -> None:
+        self._socket_path = socket_path
+        self._backend = backend
+        self._slot_ids = slot_ids
+        self._stop_event = threading.Event()
+        self._thread: threading.Thread | None = None
+        self._lock = threading.Lock()
+        self._state: dict[str, dict[str, object]] = {
+            slot_id: {
+                "enabled": False,
+                "addr": "0.0.0.0",
+                "port": 22,
+                "status": "MAINT",
+                "scur": 0,
+                "qcur": 0,
+            }
+            for slot_id in slot_ids
+        }
+
+    def start(self) -> None:
+        self._thread = threading.Thread(target=self._serve, name="fake-haproxy", daemon=True)
+        self._thread.start()
+        deadline = time.time() + 2.0
+        while time.time() < deadline:
+            if self._socket_path.exists():
+                return
+            time.sleep(0.01)
+        msg = f"fake haproxy socket not created: {self._socket_path}"
+        raise RuntimeError(msg)
+
+    def stop(self) -> None:
+        self._stop_event.set()
+        try:
+            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
+                sock.connect(str(self._socket_path))
+                sock.sendall(b"\n")
+        except OSError:
+            pass
+        if self._thread is not None:
+            self._thread.join(timeout=2.0)
+        if self._socket_path.exists():
+            self._socket_path.unlink()
+
+    def _serve(self) -> None:
+        if self._socket_path.exists():
+            self._socket_path.unlink()
+
+        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as server:
+            server.bind(str(self._socket_path))
+            server.listen(16)
+            server.settimeout(0.2)
+            while not self._stop_event.is_set():
+                try:
+                    conn, _ = server.accept()
+                except TimeoutError:
+                    continue
+                except OSError:
+                    if self._stop_event.is_set():
+                        break
+                    continue
+                with conn:
+                    payload = b""
+                    while True:
+                        chunk = conn.recv(4096)
+                        if not chunk:
+                            break
+                        payload += chunk
+                    command = payload.decode().strip()
+                    response = self._handle_command(command)
+                    try:
+                        conn.sendall(response.encode())
+                    except BrokenPipeError:
+                        continue
+
+    def _handle_command(self, command: str) -> str:
+        if command == "show stat":
+            return self._render_show_stat()
+
+        parts = command.split()
+        if not parts:
+            return "\n"
+
+        if parts[0:2] == ["set", "server"] and len(parts) >= 7:
+            slot_id = self._parse_slot(parts[2])
+            if slot_id is None:
+                return "No such server.\n"
+            with self._lock:
+                slot_state = self._state[slot_id]
+                slot_state["addr"] = parts[4]
+                slot_state["port"] = int(parts[6])
+                slot_state["status"] = "UP" if slot_state["enabled"] else "DOWN"
+            return "\n"
+
+        if parts[0:2] == ["enable", "server"] and len(parts) >= 3:
+            slot_id = self._parse_slot(parts[2])
+            if slot_id is None:
+                return "No such server.\n"
+            with self._lock:
+                slot_state = self._state[slot_id]
+                slot_state["enabled"] = True
+                slot_state["status"] = "UP"
+            return "\n"
+
+        if parts[0:2] == ["disable", "server"] and len(parts) >= 3:
+            slot_id = self._parse_slot(parts[2])
+            if slot_id is None:
+                return "No such server.\n"
+            with self._lock:
+                slot_state = self._state[slot_id]
+                slot_state["enabled"] = False
+                slot_state["status"] = "MAINT"
+            return "\n"
+
+        return "Unknown command.\n"
+
+    def _parse_slot(self, backend_slot: str) -> str | None:
+        backend, _, slot_id = backend_slot.partition("/")
+        if backend != self._backend or slot_id not in self._state:
+            return None
+        return slot_id
+
+    def _render_show_stat(self) -> str:
+        header = "# pxname,svname,qcur,qmax,scur,smax,slim,stot,status\n"
+        rows = [f"{self._backend},BACKEND,0,0,0,0,0,0,UP\n"]
+        with self._lock:
+            for slot_id in self._slot_ids:
+                slot_state = self._state[slot_id]
+                rows.append(
+                    f"{self._backend},{slot_id},{slot_state['qcur']},0,"
+                    f"{slot_state['scur']},0,50,0,{slot_state['status']}\n"
+                )
+        return header + "".join(rows)
+
+
+class DaemonHarness:
+    """In-process threaded harness for scheduler/reconciler/API integration."""
+
+    def __init__(
+        self,
+        root: Path,
+        *,
+        db_path: Path | None = None,
+        runtime: FakeRuntime | None = None,
+        max_slots: int = 3,
+        min_slots: int = 0,
+        idle_scale_down_seconds: int = 1,
+        drain_timeout_seconds: int = 120,
+    ) -> None:
+        root.mkdir(parents=True, exist_ok=True)
+        self.clock = FakeClock()
+        self.metrics = MetricsRegistry()
+        self.runtime = runtime or FakeRuntime(launch_latency_ticks=2, ip_delay_ticks=1)
+        self._stop_event = threading.Event()
+        self._threads: list[threading.Thread] = []
+        self._reconcile_lock = threading.Lock()
+
+        self._db_path = db_path or (root / "state.db")
+        self._socket_path = root / "haproxy.sock"
+        self._slot_ids = [f"slot{i:03d}" for i in range(1, 4)]
+
+        self.config = AppConfig(
+            aws=AwsConfig(region="us-east-1"),
+            haproxy=HaproxyConfig(
+                runtime_socket=str(self._socket_path),
+                backend="all",
+                slot_prefix="slot",
+                slot_count=3,
+                check_ready_up_count=1,
+            ),
+            capacity=CapacityConfig(
+                default_system="x86_64-linux",
+                max_slots=max_slots,
+                min_slots=min_slots,
+                max_leases_per_slot=1,
+                target_warm_slots=0,
+                reservation_ttl_seconds=1200,
+                idle_scale_down_seconds=idle_scale_down_seconds,
+                drain_timeout_seconds=drain_timeout_seconds,
+            ),
+            scheduler=SchedulerConfig(tick_seconds=0.05, reconcile_seconds=0.05),
+        )
+
+        self.db = StateDB(str(self._db_path), clock=self.clock)
+        self.db.init_schema()
+        self.db.init_slots("slot", 3, "x86_64-linux", "all")
+
+        self.haproxy_server = FakeHAProxySocketServer(self._socket_path, "all", self._slot_ids)
+        self.haproxy = HAProxyRuntime(str(self._socket_path), "all", "slot")
+        self.reconciler = Reconciler(
+            self.db,
+            self.runtime,
+            self.haproxy,
+            self.config,
+            self.clock,
+            self.metrics,
+        )
+
+        app = create_app(
+            self.db,
+            self.config,
+            self.clock,
+            self.metrics,
+            reconcile_now=self.reconcile_now,
+        )
+        self.client = TestClient(app)
+
+    def start(self) -> None:
+        self.haproxy_server.start()
+        with self._reconcile_lock:
+            self.runtime.tick()
+            self.reconciler.tick()
+        self._threads = [
+            threading.Thread(target=self._scheduler_loop, name="sched", daemon=True),
+            threading.Thread(target=self._reconciler_loop, name="recon", daemon=True),
+        ]
+        for thread in self._threads:
+            thread.start()
+
+    def stop(self) -> None:
+        self._stop_event.set()
+        for thread in self._threads:
+            thread.join(timeout=2.0)
+        self.client.close()
+        self.haproxy_server.stop()
+        self.db.close()
+
+    def create_reservation(self, reason: str) -> str:
+        response = self.client.post(
+            "/v1/reservations",
+            json={"system": "x86_64-linux", "reason": reason},
+        )
+        assert response.status_code == 200
+        return str(response.json()["reservation_id"])
+
+    def release_reservation(self, reservation_id: str) -> None:
+        response = self.client.post(f"/v1/reservations/{reservation_id}/release")
+        assert response.status_code == 200
+
+    def reservation(self, reservation_id: str) -> dict:
+        response = self.client.get(f"/v1/reservations/{reservation_id}")
+        assert response.status_code == 200
+        return response.json()
+
+    def wait_for(self, predicate, timeout: float = 6.0) -> None:  # noqa: ANN001
+        deadline = time.time() + timeout
+        while time.time() < deadline:
+            if predicate():
+                return
+            time.sleep(0.02)
+        raise AssertionError("condition not met before timeout")
+
+    def reconcile_now(self) -> dict[str, bool]:
+        with self._reconcile_lock:
+            self.runtime.tick()
+            self.reconciler.tick()
+        return {"triggered": True}
+
+    def _scheduler_loop(self) -> None:
+        while not self._stop_event.is_set():
+            scheduling_tick(self.db, self.runtime, self.config, self.clock, self.metrics)
+            self._stop_event.wait(self.config.scheduler.tick_seconds)
+
+    def _reconciler_loop(self) -> None:
+        while not self._stop_event.is_set():
+            with self._reconcile_lock:
+                self.runtime.tick()
+                self.reconciler.tick()
+            self._stop_event.wait(self.config.scheduler.reconcile_seconds)
+
+
+def test_cold_start_reservation_launch_bind_ready(tmp_path: Path) -> None:
+    harness = DaemonHarness(tmp_path)
+    harness.start()
+    try:
+        reservation_id = harness.create_reservation("cold-start")
+        harness.wait_for(lambda: harness.reservation(reservation_id)["phase"] == "ready")
+        reservation = harness.reservation(reservation_id)
+        assert reservation["slot"] is not None
+        slot = harness.db.get_slot(reservation["slot"])
+        assert slot is not None
+        assert slot["state"] == SlotState.READY.value
+        assert slot["instance_ip"] is not None
+    finally:
+        harness.stop()
+
+
+def test_burst_three_concurrent_reservations(tmp_path: Path) -> None:
+    harness = DaemonHarness(tmp_path, max_slots=3)
+    harness.start()
+    try:
+        reservation_ids = [harness.create_reservation(f"burst-{i}") for i in range(3)]
+        harness.wait_for(
+            lambda: all(harness.reservation(rid)["phase"] == "ready" for rid in reservation_ids),
+            timeout=8.0,
+        )
+        slots = [harness.reservation(rid)["slot"] for rid in reservation_ids]
+        assert len(set(slots)) == 3
+    finally:
+        harness.stop()
+
+
+def test_scale_down_after_release_and_idle_timeout(tmp_path: Path) -> None:
+    harness = DaemonHarness(tmp_path, idle_scale_down_seconds=1, drain_timeout_seconds=0)
+    harness.start()
+    try:
+        reservation_id = harness.create_reservation("scale-down")
+        harness.wait_for(lambda: harness.reservation(reservation_id)["phase"] == "ready")
+        slot_id = str(harness.reservation(reservation_id)["slot"])
+
+        harness.release_reservation(reservation_id)
+        harness.clock.advance(2)
+        harness.wait_for(
+            lambda: (
+                harness.db.get_slot(slot_id) is not None
+                and harness.db.get_slot(slot_id)["state"] == SlotState.EMPTY.value
+            )
+        )
+    finally:
+        harness.stop()
+
+
+def test_restart_recovery_midflight(tmp_path: Path) -> None:
+    db_path = tmp_path / "state.db"
+    runtime = FakeRuntime(launch_latency_ticks=6, ip_delay_ticks=2)
+
+    first = DaemonHarness(tmp_path / "run1", db_path=db_path, runtime=runtime)
+    first.start()
+    reservation_id = first.create_reservation("restart-midflight")
+    first.wait_for(
+        lambda: len(first.db.list_slots(SlotState.LAUNCHING)) > 0,
+        timeout=4.0,
+    )
+    first.stop()
+
+    second = DaemonHarness(tmp_path / "run2", db_path=db_path, runtime=runtime)
+    second.start()
+    try:
+        second.wait_for(lambda: second.reservation(reservation_id)["phase"] == "ready", timeout=8.0)
+    finally:
+        second.stop()
+
+
+def test_interruption_recovery_pending_reservation_resolves(tmp_path: Path) -> None:
+    harness = DaemonHarness(tmp_path, max_slots=2, idle_scale_down_seconds=60)
+    harness.start()
+    try:
+        first_reservation = harness.create_reservation("baseline")
+        harness.wait_for(lambda: harness.reservation(first_reservation)["phase"] == "ready")
+        slot_id = str(harness.reservation(first_reservation)["slot"])
+        instance_id = str(harness.reservation(first_reservation)["instance_id"])
+
+        second_reservation = harness.create_reservation("post-interruption")
+        harness.release_reservation(first_reservation)
+
+        harness.runtime.inject_interruption(instance_id)
+        harness.runtime._instances[instance_id].state = "shutting-down"
+
+        harness.wait_for(
+            lambda: (
+                harness.db.get_slot(slot_id) is not None
+                and harness.db.get_slot(slot_id)["state"]
+                in {
+                    SlotState.DRAINING.value,
+                    SlotState.TERMINATING.value,
+                    SlotState.EMPTY.value,
+                }
+            ),
+            timeout=6.0,
+        )
+        harness.wait_for(
+            lambda: harness.reservation(second_reservation)["phase"] == "ready",
+            timeout=10.0,
+        )
+    finally:
+        harness.stop()