Fix booting slots stuck when cooldown blocks termination
This commit is contained in:
parent
f0fd0f342e
commit
95021a4253
2 changed files with 43 additions and 2 deletions
|
|
@ -328,10 +328,28 @@ class Reconciler:
|
||||||
return (now - last_change).total_seconds()
|
return (now - last_change).total_seconds()
|
||||||
|
|
||||||
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
|
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
|
||||||
if not self._can_start_termination():
|
|
||||||
return
|
|
||||||
slot_id = slot["slot_id"]
|
slot_id = slot["slot_id"]
|
||||||
instance_id = slot.get("instance_id")
|
instance_id = slot.get("instance_id")
|
||||||
|
ec2_state = None
|
||||||
|
if extra is not None:
|
||||||
|
ec2_state = extra.get("ec2_state")
|
||||||
|
|
||||||
|
if not self._can_start_termination():
|
||||||
|
# If cooldown is active but EC2 is already terminal/stopped, do not
|
||||||
|
# keep the slot stuck in its current state.
|
||||||
|
if instance_id is None or ec2_state in _TERMINAL_OR_STOPPED_STATES:
|
||||||
|
self._db.update_slot_state(
|
||||||
|
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
|
||||||
|
)
|
||||||
|
payload = {"slot_id": slot_id}
|
||||||
|
if instance_id:
|
||||||
|
payload["instance_id"] = instance_id
|
||||||
|
if extra:
|
||||||
|
payload.update(extra)
|
||||||
|
log.warning(reason, extra=payload)
|
||||||
|
log.info("slot_emptied", extra={"slot_id": slot_id})
|
||||||
|
return
|
||||||
|
|
||||||
started_terminating = False
|
started_terminating = False
|
||||||
if instance_id:
|
if instance_id:
|
||||||
self._terminate_instance_best_effort(slot_id, instance_id)
|
self._terminate_instance_best_effort(slot_id, instance_id)
|
||||||
|
|
|
||||||
|
|
@ -228,3 +228,26 @@ def test_termination_cooldown_spaces_terminations() -> None:
|
||||||
assert slot is not None
|
assert slot is not None
|
||||||
assert slot["state"] == SlotState.TERMINATING.value
|
assert slot["state"] == SlotState.TERMINATING.value
|
||||||
assert runtime.terminate_calls == ["i-6", "i-7"]
|
assert runtime.terminate_calls == ["i-6", "i-7"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_booting_terminal_state_bypasses_termination_cooldown() -> None:
|
||||||
|
db, runtime, reconciler, clock = _make_env(termination_cooldown=30)
|
||||||
|
|
||||||
|
# Start cooldown by triggering one termination.
|
||||||
|
runtime.instances["i-8"] = _Instance(state="running", slot_id="slot001")
|
||||||
|
db.update_slot_state("slot001", SlotState.DRAINING, instance_id="i-8", lease_count=0)
|
||||||
|
reconciler.tick()
|
||||||
|
assert runtime.terminate_calls == ["i-8"]
|
||||||
|
|
||||||
|
# During cooldown, a booting slot whose instance is already terminated
|
||||||
|
# should be emptied immediately without waiting for cooldown expiry.
|
||||||
|
runtime.instances["i-9"] = _Instance(state="terminated", slot_id="slot001")
|
||||||
|
db.update_slot_state("slot001", SlotState.BOOTING, instance_id="i-9")
|
||||||
|
clock.advance(1)
|
||||||
|
reconciler.tick()
|
||||||
|
|
||||||
|
slot = db.get_slot("slot001")
|
||||||
|
assert slot is not None
|
||||||
|
assert slot["state"] == SlotState.EMPTY.value
|
||||||
|
assert slot["instance_id"] is None
|
||||||
|
assert runtime.terminate_calls == ["i-8"]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue