Fix booting slots getting stuck when cooldown blocks termination
This commit is contained in:
parent
f0fd0f342e
commit
95021a4253
2 changed files with 43 additions and 2 deletions
|
|
@ -328,10 +328,28 @@ class Reconciler:
|
|||
return (now - last_change).total_seconds()
|
||||
|
||||
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
|
||||
if not self._can_start_termination():
|
||||
return
|
||||
slot_id = slot["slot_id"]
|
||||
instance_id = slot.get("instance_id")
|
||||
ec2_state = None
|
||||
if extra is not None:
|
||||
ec2_state = extra.get("ec2_state")
|
||||
|
||||
if not self._can_start_termination():
|
||||
# If cooldown is active but EC2 is already terminal/stopped, do not
|
||||
# keep the slot stuck in its current state.
|
||||
if instance_id is None or ec2_state in _TERMINAL_OR_STOPPED_STATES:
|
||||
self._db.update_slot_state(
|
||||
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
|
||||
)
|
||||
payload = {"slot_id": slot_id}
|
||||
if instance_id:
|
||||
payload["instance_id"] = instance_id
|
||||
if extra:
|
||||
payload.update(extra)
|
||||
log.warning(reason, extra=payload)
|
||||
log.info("slot_emptied", extra={"slot_id": slot_id})
|
||||
return
|
||||
|
||||
started_terminating = False
|
||||
if instance_id:
|
||||
self._terminate_instance_best_effort(slot_id, instance_id)
|
||||
|
|
|
|||
|
|
@ -228,3 +228,26 @@ def test_termination_cooldown_spaces_terminations() -> None:
|
|||
assert slot is not None
|
||||
assert slot["state"] == SlotState.TERMINATING.value
|
||||
assert runtime.terminate_calls == ["i-6", "i-7"]
|
||||
|
||||
|
||||
def test_booting_terminal_state_bypasses_termination_cooldown() -> None:
    """A BOOTING slot with an already-terminated instance is emptied mid-cooldown.

    The first tick terminates ``i-8`` and thereby arms the termination
    cooldown (configured as 30 here). One clock step later the same slot is
    put in BOOTING with a terminated instance; the next tick must reset the
    slot to EMPTY and must not issue another terminate call.
    """
    store, fake_runtime, recon, fake_clock = _make_env(termination_cooldown=30)

    # Arm the cooldown by letting the reconciler terminate one instance.
    fake_runtime.instances["i-8"] = _Instance(state="running", slot_id="slot001")
    store.update_slot_state(
        "slot001", SlotState.DRAINING, instance_id="i-8", lease_count=0
    )
    recon.tick()
    assert fake_runtime.terminate_calls == ["i-8"]

    # While the cooldown is still running, point the slot at an instance that
    # is already terminated: it should be emptied right away rather than
    # waiting for the cooldown to expire.
    fake_runtime.instances["i-9"] = _Instance(state="terminated", slot_id="slot001")
    store.update_slot_state("slot001", SlotState.BOOTING, instance_id="i-9")
    fake_clock.advance(1)
    recon.tick()

    emptied = store.get_slot("slot001")
    assert emptied is not None
    assert emptied["state"] == SlotState.EMPTY.value
    assert emptied["instance_id"] is None
    # No second terminate call was issued for the already-dead instance.
    assert fake_runtime.terminate_calls == ["i-8"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue