Fix booting slots getting stuck when cooldown blocks termination
All checks were successful
buildbot/nix-eval Build done.
buildbot/nix-build Build done.
buildbot/nix-effects Build done.

This commit is contained in:
Abel Luck 2026-03-05 13:10:10 +01:00
parent f0fd0f342e
commit 95021a4253
2 changed files with 43 additions and 2 deletions

View file

@ -328,10 +328,28 @@ class Reconciler:
return (now - last_change).total_seconds() return (now - last_change).total_seconds()
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None: def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
if not self._can_start_termination():
return
slot_id = slot["slot_id"] slot_id = slot["slot_id"]
instance_id = slot.get("instance_id") instance_id = slot.get("instance_id")
ec2_state = None
if extra is not None:
ec2_state = extra.get("ec2_state")
if not self._can_start_termination():
# If cooldown is active but EC2 is already terminal/stopped, do not
# keep the slot stuck in its current state.
if instance_id is None or ec2_state in _TERMINAL_OR_STOPPED_STATES:
self._db.update_slot_state(
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
)
payload = {"slot_id": slot_id}
if instance_id:
payload["instance_id"] = instance_id
if extra:
payload.update(extra)
log.warning(reason, extra=payload)
log.info("slot_emptied", extra={"slot_id": slot_id})
return
started_terminating = False started_terminating = False
if instance_id: if instance_id:
self._terminate_instance_best_effort(slot_id, instance_id) self._terminate_instance_best_effort(slot_id, instance_id)

View file

@ -228,3 +228,26 @@ def test_termination_cooldown_spaces_terminations() -> None:
assert slot is not None assert slot is not None
assert slot["state"] == SlotState.TERMINATING.value assert slot["state"] == SlotState.TERMINATING.value
assert runtime.terminate_calls == ["i-6", "i-7"] assert runtime.terminate_calls == ["i-6", "i-7"]
def test_booting_terminal_state_bypasses_termination_cooldown() -> None:
    """A BOOTING slot whose instance is already terminated is emptied mid-cooldown.

    The first tick terminates a draining instance, which arms the termination
    cooldown. A second tick, taken before the cooldown has elapsed, must still
    reset the slot to EMPTY and must not issue another terminate call.
    """
    store, fake_runtime, recon, fake_clock = _make_env(termination_cooldown=30)

    # Arm the cooldown: a draining slot with no leases gets its instance terminated.
    fake_runtime.instances["i-8"] = _Instance(state="running", slot_id="slot001")
    store.update_slot_state(
        "slot001", SlotState.DRAINING, instance_id="i-8", lease_count=0
    )
    recon.tick()
    assert fake_runtime.terminate_calls == ["i-8"]

    # Still inside the cooldown window: the slot is now BOOTING, but its
    # instance is already reported as terminated, so it should be emptied
    # right away instead of waiting for the cooldown to expire.
    fake_runtime.instances["i-9"] = _Instance(state="terminated", slot_id="slot001")
    store.update_slot_state("slot001", SlotState.BOOTING, instance_id="i-9")
    fake_clock.advance(1)
    recon.tick()

    emptied = store.get_slot("slot001")
    assert emptied is not None
    assert emptied["state"] == SlotState.EMPTY.value
    assert emptied["instance_id"] is None
    # Emptying the slot must not have triggered a second terminate call.
    assert fake_runtime.terminate_calls == ["i-8"]