Fix booting slots getting stuck when cooldown blocks termination
This commit is contained in:
parent
f0fd0f342e
commit
95021a4253
2 changed files with 43 additions and 2 deletions
|
|
@ -328,10 +328,28 @@ class Reconciler:
|
|||
return (now - last_change).total_seconds()
|
||||
|
||||
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
|
||||
if not self._can_start_termination():
|
||||
return
|
||||
slot_id = slot["slot_id"]
|
||||
instance_id = slot.get("instance_id")
|
||||
ec2_state = None
|
||||
if extra is not None:
|
||||
ec2_state = extra.get("ec2_state")
|
||||
|
||||
if not self._can_start_termination():
|
||||
# If cooldown is active but EC2 is already terminal/stopped, do not
|
||||
# keep the slot stuck in its current state.
|
||||
if instance_id is None or ec2_state in _TERMINAL_OR_STOPPED_STATES:
|
||||
self._db.update_slot_state(
|
||||
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
|
||||
)
|
||||
payload = {"slot_id": slot_id}
|
||||
if instance_id:
|
||||
payload["instance_id"] = instance_id
|
||||
if extra:
|
||||
payload.update(extra)
|
||||
log.warning(reason, extra=payload)
|
||||
log.info("slot_emptied", extra={"slot_id": slot_id})
|
||||
return
|
||||
|
||||
started_terminating = False
|
||||
if instance_id:
|
||||
self._terminate_instance_best_effort(slot_id, instance_id)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue