add termination cooldown for slot scale-down
All checks were successful
buildbot/nix-eval Build done.
buildbot/nix-build Build done.
buildbot/nix-effects Build done.

This commit is contained in:
Abel Luck 2026-02-27 18:37:58 +01:00
parent e1dbd5c119
commit 44bc99ab85
7 changed files with 72 additions and 1 deletion

View file

@ -10,7 +10,7 @@ from __future__ import annotations
import contextlib
import logging
import time
from datetime import datetime
from datetime import datetime, timedelta
from typing import TYPE_CHECKING
from .models import SlotState
@ -50,6 +50,7 @@ class Reconciler:
self._clock = clock
self._metrics = metrics
self._binding_up_counts: dict[str, int] = {}
self._termination_cooldown_until: datetime | None = None
def tick(self) -> None:
"""Execute one reconciliation tick."""
@ -255,6 +256,8 @@ class Reconciler:
drain_timeout = self._config.capacity.drain_timeout_seconds
if slot["lease_count"] == 0 or drain_duration >= drain_timeout:
if not self._can_start_termination():
return
instance_id = slot.get("instance_id")
if instance_id:
try:
@ -276,6 +279,7 @@ class Reconciler:
exc_info=True,
)
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
self._mark_termination_started()
log.info(
"slot_terminating",
extra={"slot_id": slot_id, "drain_duration": drain_duration},
@ -324,11 +328,15 @@ class Reconciler:
return (now - last_change).total_seconds()
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
if not self._can_start_termination():
return
slot_id = slot["slot_id"]
instance_id = slot.get("instance_id")
started_terminating = False
if instance_id:
self._terminate_instance_best_effort(slot_id, instance_id)
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
started_terminating = True
else:
self._db.update_slot_state(
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
@ -340,6 +348,23 @@ class Reconciler:
if extra:
payload.update(extra)
log.warning(reason, extra=payload)
if started_terminating:
self._mark_termination_started()
def _can_start_termination(self) -> bool:
    """Report whether a new termination is allowed at this moment.

    Returns:
        True when the configured ``termination_cooldown_seconds`` is zero
        or negative, when no cooldown deadline has been recorded, or when
        the clock has reached the recorded deadline; False otherwise.
    """
    window = self._config.capacity.termination_cooldown_seconds
    # A non-positive window means the cooldown check is skipped entirely.
    if window <= 0:
        return True
    deadline = self._termination_cooldown_until
    # With no deadline recorded there is nothing to wait for; otherwise
    # the deadline must have been reached (inclusive comparison).
    return deadline is None or self._clock.now() >= deadline
def _mark_termination_started(self) -> None:
    """Record the cooldown deadline that follows a started termination.

    Stores ``now + termination_cooldown_seconds`` in
    ``_termination_cooldown_until``. When the configured cooldown is zero
    or negative, any stored deadline is cleared (set to ``None``) instead
    and the clock is not consulted.
    """
    window = self._config.capacity.termination_cooldown_seconds
    self._termination_cooldown_until = (
        None
        if window <= 0
        else self._clock.now() + timedelta(seconds=window)
    )
def _terminate_instance_best_effort(self, slot_id: str, instance_id: str) -> None:
try: