From 95021a425318da1664c37efa88981cc1e4bd622b Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Thu, 5 Mar 2026 13:10:10 +0100 Subject: [PATCH] Fix booting slots stuck when cooldown blocks termination --- agent/nix_builder_autoscaler/reconciler.py | 22 ++++++++++++++++-- .../tests/test_reconciler.py | 23 +++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/agent/nix_builder_autoscaler/reconciler.py b/agent/nix_builder_autoscaler/reconciler.py index d632f24..046957d 100644 --- a/agent/nix_builder_autoscaler/reconciler.py +++ b/agent/nix_builder_autoscaler/reconciler.py @@ -328,10 +328,28 @@ class Reconciler: return (now - last_change).total_seconds() def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None: - if not self._can_start_termination(): - return slot_id = slot["slot_id"] instance_id = slot.get("instance_id") + ec2_state = None + if extra is not None: + ec2_state = extra.get("ec2_state") + + if not self._can_start_termination(): + # If cooldown is active but EC2 is already terminal/stopped, do not + # keep the slot stuck in its current state. 
+ if instance_id is None or ec2_state in _TERMINAL_OR_STOPPED_STATES: + self._db.update_slot_state( + slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0 + ) + payload = {"slot_id": slot_id} + if instance_id: + payload["instance_id"] = instance_id + if extra: + payload.update(extra) + log.warning(reason, extra=payload) + log.info("slot_emptied", extra={"slot_id": slot_id}) + return + started_terminating = False if instance_id: self._terminate_instance_best_effort(slot_id, instance_id) diff --git a/agent/nix_builder_autoscaler/tests/test_reconciler.py b/agent/nix_builder_autoscaler/tests/test_reconciler.py index 257dea7..f755562 100644 --- a/agent/nix_builder_autoscaler/tests/test_reconciler.py +++ b/agent/nix_builder_autoscaler/tests/test_reconciler.py @@ -228,3 +228,26 @@ def test_termination_cooldown_spaces_terminations() -> None: assert slot is not None assert slot["state"] == SlotState.TERMINATING.value assert runtime.terminate_calls == ["i-6", "i-7"] + + +def test_booting_terminal_state_bypasses_termination_cooldown() -> None: + db, runtime, reconciler, clock = _make_env(termination_cooldown=30) + + # Start cooldown by triggering one termination. + runtime.instances["i-8"] = _Instance(state="running", slot_id="slot001") + db.update_slot_state("slot001", SlotState.DRAINING, instance_id="i-8", lease_count=0) + reconciler.tick() + assert runtime.terminate_calls == ["i-8"] + + # During cooldown, a booting slot whose instance is already terminated + # should be emptied immediately without waiting for cooldown expiry. + runtime.instances["i-9"] = _Instance(state="terminated", slot_id="slot001") + db.update_slot_state("slot001", SlotState.BOOTING, instance_id="i-9") + clock.advance(1) + reconciler.tick() + + slot = db.get_slot("slot001") + assert slot is not None + assert slot["state"] == SlotState.EMPTY.value + assert slot["instance_id"] is None + assert runtime.terminate_calls == ["i-8"]