add termination cooldown for slot scale-down
All checks were successful
buildbot/nix-eval Build done.
buildbot/nix-build Build done.
buildbot/nix-effects Build done.

This commit is contained in:
Abel Luck 2026-02-27 18:37:58 +01:00
parent e1dbd5c119
commit 44bc99ab85
7 changed files with 72 additions and 1 deletions

View file

@ -197,6 +197,7 @@ def create_app(
boot_timeout_seconds=config.capacity.boot_timeout_seconds, boot_timeout_seconds=config.capacity.boot_timeout_seconds,
binding_timeout_seconds=config.capacity.binding_timeout_seconds, binding_timeout_seconds=config.capacity.binding_timeout_seconds,
terminating_timeout_seconds=config.capacity.terminating_timeout_seconds, terminating_timeout_seconds=config.capacity.terminating_timeout_seconds,
termination_cooldown_seconds=config.capacity.termination_cooldown_seconds,
), ),
scheduler=SchedulerPolicy( scheduler=SchedulerPolicy(
tick_seconds=config.scheduler.tick_seconds, tick_seconds=config.scheduler.tick_seconds,

View file

@ -50,6 +50,7 @@ class SystemConfig:
max_leases_per_slot: int = 1 max_leases_per_slot: int = 1
launch_batch_size: int = 1 launch_batch_size: int = 1
scale_down_idle_seconds: int = 900 scale_down_idle_seconds: int = 900
termination_cooldown_seconds: int = 180
@dataclass @dataclass
@ -68,6 +69,7 @@ class CapacityConfig:
boot_timeout_seconds: int = 300 boot_timeout_seconds: int = 300
binding_timeout_seconds: int = 180 binding_timeout_seconds: int = 180
terminating_timeout_seconds: int = 300 terminating_timeout_seconds: int = 300
termination_cooldown_seconds: int = 180
@dataclass @dataclass

View file

@ -144,6 +144,7 @@ class CapacityPolicy(BaseModel):
boot_timeout_seconds: int boot_timeout_seconds: int
binding_timeout_seconds: int binding_timeout_seconds: int
terminating_timeout_seconds: int terminating_timeout_seconds: int
termination_cooldown_seconds: int
class SchedulerPolicy(BaseModel): class SchedulerPolicy(BaseModel):

View file

@ -10,7 +10,7 @@ from __future__ import annotations
import contextlib import contextlib
import logging import logging
import time import time
from datetime import datetime from datetime import datetime, timedelta
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from .models import SlotState from .models import SlotState
@ -50,6 +50,7 @@ class Reconciler:
self._clock = clock self._clock = clock
self._metrics = metrics self._metrics = metrics
self._binding_up_counts: dict[str, int] = {} self._binding_up_counts: dict[str, int] = {}
self._termination_cooldown_until: datetime | None = None
def tick(self) -> None: def tick(self) -> None:
"""Execute one reconciliation tick.""" """Execute one reconciliation tick."""
@ -255,6 +256,8 @@ class Reconciler:
drain_timeout = self._config.capacity.drain_timeout_seconds drain_timeout = self._config.capacity.drain_timeout_seconds
if slot["lease_count"] == 0 or drain_duration >= drain_timeout: if slot["lease_count"] == 0 or drain_duration >= drain_timeout:
if not self._can_start_termination():
return
instance_id = slot.get("instance_id") instance_id = slot.get("instance_id")
if instance_id: if instance_id:
try: try:
@ -276,6 +279,7 @@ class Reconciler:
exc_info=True, exc_info=True,
) )
self._db.update_slot_state(slot_id, SlotState.TERMINATING) self._db.update_slot_state(slot_id, SlotState.TERMINATING)
self._mark_termination_started()
log.info( log.info(
"slot_terminating", "slot_terminating",
extra={"slot_id": slot_id, "drain_duration": drain_duration}, extra={"slot_id": slot_id, "drain_duration": drain_duration},
@ -324,11 +328,15 @@ class Reconciler:
return (now - last_change).total_seconds() return (now - last_change).total_seconds()
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None: def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
if not self._can_start_termination():
return
slot_id = slot["slot_id"] slot_id = slot["slot_id"]
instance_id = slot.get("instance_id") instance_id = slot.get("instance_id")
started_terminating = False
if instance_id: if instance_id:
self._terminate_instance_best_effort(slot_id, instance_id) self._terminate_instance_best_effort(slot_id, instance_id)
self._db.update_slot_state(slot_id, SlotState.TERMINATING) self._db.update_slot_state(slot_id, SlotState.TERMINATING)
started_terminating = True
else: else:
self._db.update_slot_state( self._db.update_slot_state(
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0 slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
@ -340,6 +348,23 @@ class Reconciler:
if extra: if extra:
payload.update(extra) payload.update(extra)
log.warning(reason, extra=payload) log.warning(reason, extra=payload)
if started_terminating:
self._mark_termination_started()
def _can_start_termination(self) -> bool:
cooldown_seconds = self._config.capacity.termination_cooldown_seconds
if cooldown_seconds <= 0:
return True
if self._termination_cooldown_until is None:
return True
return self._clock.now() >= self._termination_cooldown_until
def _mark_termination_started(self) -> None:
cooldown_seconds = self._config.capacity.termination_cooldown_seconds
if cooldown_seconds <= 0:
self._termination_cooldown_until = None
return
self._termination_cooldown_until = self._clock.now() + timedelta(seconds=cooldown_seconds)
def _terminate_instance_best_effort(self, slot_id: str, instance_id: str) -> None: def _terminate_instance_best_effort(self, slot_id: str, instance_id: str) -> None:
try: try:

View file

@ -71,6 +71,7 @@ def _make_env(
boot_timeout=300, boot_timeout=300,
binding_timeout=180, binding_timeout=180,
terminating_timeout=300, terminating_timeout=300,
termination_cooldown=0,
): ):
clock = FakeClock() clock = FakeClock()
db = StateDB(":memory:", clock=clock) db = StateDB(":memory:", clock=clock)
@ -85,6 +86,7 @@ def _make_env(
boot_timeout_seconds=boot_timeout, boot_timeout_seconds=boot_timeout,
binding_timeout_seconds=binding_timeout, binding_timeout_seconds=binding_timeout,
terminating_timeout_seconds=terminating_timeout, terminating_timeout_seconds=terminating_timeout,
termination_cooldown_seconds=termination_cooldown,
), ),
aws=AwsConfig(region="us-east-1"), aws=AwsConfig(region="us-east-1"),
) )
@ -195,3 +197,34 @@ def test_terminating_timeout_reissues_terminate_with_pacing() -> None:
# Immediate next tick should not retry yet because last_state_change was refreshed. # Immediate next tick should not retry yet because last_state_change was refreshed.
reconciler.tick() reconciler.tick()
assert runtime.terminate_calls == ["i-5"] assert runtime.terminate_calls == ["i-5"]
def test_termination_cooldown_spaces_terminations() -> None:
    """A second termination is held back until the cooldown has elapsed."""
    db, runtime, reconciler, clock = _make_env(termination_cooldown=30)

    def start_draining(instance_id: str) -> None:
        # Register a running instance and put slot001 into DRAINING with no leases.
        runtime.instances[instance_id] = _Instance(state="running", slot_id="slot001")
        db.update_slot_state(
            "slot001", SlotState.DRAINING, instance_id=instance_id, lease_count=0
        )

    def current_state() -> str:
        row = db.get_slot("slot001")
        assert row is not None
        return row["state"]

    # First termination starts immediately: no cooldown has been armed yet.
    start_draining("i-6")
    reconciler.tick()
    assert current_state() == SlotState.TERMINATING.value
    assert runtime.terminate_calls == ["i-6"]

    # New draining cycle before cooldown expires should not terminate yet.
    start_draining("i-7")
    clock.advance(10)
    reconciler.tick()
    assert current_state() == SlotState.DRAINING.value
    assert runtime.terminate_calls == ["i-6"]

    # After cooldown, termination proceeds.
    clock.advance(21)
    reconciler.tick()
    assert current_state() == SlotState.TERMINATING.value
    assert runtime.terminate_calls == ["i-6", "i-7"]

View file

@ -118,6 +118,7 @@ def test_effective_config_returns_capacity_and_scheduler() -> None:
body = response.json() body = response.json()
assert body["capacity"]["max_slots"] == 8 assert body["capacity"]["max_slots"] == 8
assert body["capacity"]["idle_scale_down_seconds"] == 900 assert body["capacity"]["idle_scale_down_seconds"] == 900
assert body["capacity"]["termination_cooldown_seconds"] == 180
assert body["scheduler"]["tick_seconds"] == 3.0 assert body["scheduler"]["tick_seconds"] == 3.0
assert body["scheduler"]["reconcile_seconds"] == 15.0 assert body["scheduler"]["reconcile_seconds"] == 15.0

View file

@ -205,6 +205,12 @@ in
description = "Max seconds between terminate retries while slot is terminating."; description = "Max seconds between terminate retries while slot is terminating.";
}; };
terminationCooldownSeconds = lib.mkOption {
type = lib.types.int;
default = 180;
description = "Minimum cooldown in seconds between starting slot terminations.";
};
launchBatchSize = lib.mkOption { launchBatchSize = lib.mkOption {
type = lib.types.int; type = lib.types.int;
default = 1; default = 1;
@ -329,6 +335,7 @@ in
boot_timeout_seconds = ${toString cfg.capacity.bootTimeoutSeconds} boot_timeout_seconds = ${toString cfg.capacity.bootTimeoutSeconds}
binding_timeout_seconds = ${toString cfg.capacity.bindingTimeoutSeconds} binding_timeout_seconds = ${toString cfg.capacity.bindingTimeoutSeconds}
terminating_timeout_seconds = ${toString cfg.capacity.terminatingTimeoutSeconds} terminating_timeout_seconds = ${toString cfg.capacity.terminatingTimeoutSeconds}
termination_cooldown_seconds = ${toString cfg.capacity.terminationCooldownSeconds}
[security] [security]
socket_mode = "${cfg.security.socketMode}" socket_mode = "${cfg.security.socketMode}"
@ -343,6 +350,7 @@ in
max_leases_per_slot = ${toString cfg.capacity.maxLeasesPerSlot} max_leases_per_slot = ${toString cfg.capacity.maxLeasesPerSlot}
launch_batch_size = ${toString cfg.capacity.launchBatchSize} launch_batch_size = ${toString cfg.capacity.launchBatchSize}
scale_down_idle_seconds = ${toString cfg.capacity.idleScaleDownSeconds} scale_down_idle_seconds = ${toString cfg.capacity.idleScaleDownSeconds}
termination_cooldown_seconds = ${toString cfg.capacity.terminationCooldownSeconds}
EOF EOF
chown ${cfg.user}:${cfg.group} ${generatedConfigPath} chown ${cfg.user}:${cfg.group} ${generatedConfigPath}