add termination cooldown for slot scale-down
This commit is contained in:
parent
e1dbd5c119
commit
44bc99ab85
7 changed files with 72 additions and 1 deletions
|
|
@ -10,7 +10,7 @@ from __future__ import annotations
|
|||
import contextlib
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from .models import SlotState
|
||||
|
|
@ -50,6 +50,7 @@ class Reconciler:
|
|||
self._clock = clock
|
||||
self._metrics = metrics
|
||||
self._binding_up_counts: dict[str, int] = {}
|
||||
self._termination_cooldown_until: datetime | None = None
|
||||
|
||||
def tick(self) -> None:
|
||||
"""Execute one reconciliation tick."""
|
||||
|
|
@ -255,6 +256,8 @@ class Reconciler:
|
|||
|
||||
drain_timeout = self._config.capacity.drain_timeout_seconds
|
||||
if slot["lease_count"] == 0 or drain_duration >= drain_timeout:
|
||||
if not self._can_start_termination():
|
||||
return
|
||||
instance_id = slot.get("instance_id")
|
||||
if instance_id:
|
||||
try:
|
||||
|
|
@ -276,6 +279,7 @@ class Reconciler:
|
|||
exc_info=True,
|
||||
)
|
||||
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
|
||||
self._mark_termination_started()
|
||||
log.info(
|
||||
"slot_terminating",
|
||||
extra={"slot_id": slot_id, "drain_duration": drain_duration},
|
||||
|
|
@ -324,11 +328,15 @@ class Reconciler:
|
|||
return (now - last_change).total_seconds()
|
||||
|
||||
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
|
||||
if not self._can_start_termination():
|
||||
return
|
||||
slot_id = slot["slot_id"]
|
||||
instance_id = slot.get("instance_id")
|
||||
started_terminating = False
|
||||
if instance_id:
|
||||
self._terminate_instance_best_effort(slot_id, instance_id)
|
||||
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
|
||||
started_terminating = True
|
||||
else:
|
||||
self._db.update_slot_state(
|
||||
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
|
||||
|
|
@ -340,6 +348,23 @@ class Reconciler:
|
|||
if extra:
|
||||
payload.update(extra)
|
||||
log.warning(reason, extra=payload)
|
||||
if started_terminating:
|
||||
self._mark_termination_started()
|
||||
|
||||
def _can_start_termination(self) -> bool:
    """Report whether a new slot termination may be started right now.

    Returns:
        True when the configured ``termination_cooldown_seconds`` is zero
        or negative, when no cooldown deadline has been recorded yet, or
        when the clock has reached the recorded deadline; False otherwise.
    """
    # A non-positive setting short-circuits before any deadline is consulted.
    if self._config.capacity.termination_cooldown_seconds <= 0:
        return True

    deadline = self._termination_cooldown_until
    # The clock is only read when a deadline actually exists.
    return deadline is None or self._clock.now() >= deadline
|
||||
|
||||
def _mark_termination_started(self) -> None:
    """Record that a termination just began and set the cooldown deadline.

    With a positive ``termination_cooldown_seconds`` the deadline becomes
    the current clock time plus that many seconds. With a zero or negative
    value any previously stored deadline is cleared.
    """
    window = self._config.capacity.termination_cooldown_seconds
    self._termination_cooldown_until = (
        None if window <= 0 else self._clock.now() + timedelta(seconds=window)
    )
|
||||
|
||||
def _terminate_instance_best_effort(self, slot_id: str, instance_id: str) -> None:
|
||||
try:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue