Fix booting slots stuck when cooldown blocks termination
All checks were successful
buildbot/nix-eval Build done.
buildbot/nix-build Build done.
buildbot/nix-effects Build done.

This commit is contained in:
Abel Luck 2026-03-05 13:10:10 +01:00
parent f0fd0f342e
commit 95021a4253
2 changed files with 43 additions and 2 deletions

View file

@ -328,10 +328,28 @@ class Reconciler:
return (now - last_change).total_seconds()
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
if not self._can_start_termination():
return
slot_id = slot["slot_id"]
instance_id = slot.get("instance_id")
ec2_state = None
if extra is not None:
ec2_state = extra.get("ec2_state")
if not self._can_start_termination():
# If cooldown is active but EC2 is already terminal/stopped, do not
# keep the slot stuck in its current state.
if instance_id is None or ec2_state in _TERMINAL_OR_STOPPED_STATES:
self._db.update_slot_state(
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
)
payload = {"slot_id": slot_id}
if instance_id:
payload["instance_id"] = instance_id
if extra:
payload.update(extra)
log.warning(reason, extra=payload)
log.info("slot_emptied", extra={"slot_id": slot_id})
return
started_terminating = False
if instance_id:
self._terminate_instance_best_effort(slot_id, instance_id)