add termination cooldown for slot scale-down
This commit is contained in:
parent
e1dbd5c119
commit
44bc99ab85
7 changed files with 72 additions and 1 deletions
|
|
@ -197,6 +197,7 @@ def create_app(
|
|||
boot_timeout_seconds=config.capacity.boot_timeout_seconds,
|
||||
binding_timeout_seconds=config.capacity.binding_timeout_seconds,
|
||||
terminating_timeout_seconds=config.capacity.terminating_timeout_seconds,
|
||||
termination_cooldown_seconds=config.capacity.termination_cooldown_seconds,
|
||||
),
|
||||
scheduler=SchedulerPolicy(
|
||||
tick_seconds=config.scheduler.tick_seconds,
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ class SystemConfig:
|
|||
max_leases_per_slot: int = 1
|
||||
launch_batch_size: int = 1
|
||||
scale_down_idle_seconds: int = 900
|
||||
termination_cooldown_seconds: int = 180
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
@ -68,6 +69,7 @@ class CapacityConfig:
|
|||
boot_timeout_seconds: int = 300
|
||||
binding_timeout_seconds: int = 180
|
||||
terminating_timeout_seconds: int = 300
|
||||
termination_cooldown_seconds: int = 180
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
|||
|
|
@ -144,6 +144,7 @@ class CapacityPolicy(BaseModel):
|
|||
boot_timeout_seconds: int
|
||||
binding_timeout_seconds: int
|
||||
terminating_timeout_seconds: int
|
||||
termination_cooldown_seconds: int
|
||||
|
||||
|
||||
class SchedulerPolicy(BaseModel):
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from __future__ import annotations
|
|||
import contextlib
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from .models import SlotState
|
||||
|
|
@ -50,6 +50,7 @@ class Reconciler:
|
|||
self._clock = clock
|
||||
self._metrics = metrics
|
||||
self._binding_up_counts: dict[str, int] = {}
|
||||
self._termination_cooldown_until: datetime | None = None
|
||||
|
||||
def tick(self) -> None:
|
||||
"""Execute one reconciliation tick."""
|
||||
|
|
@ -255,6 +256,8 @@ class Reconciler:
|
|||
|
||||
drain_timeout = self._config.capacity.drain_timeout_seconds
|
||||
if slot["lease_count"] == 0 or drain_duration >= drain_timeout:
|
||||
if not self._can_start_termination():
|
||||
return
|
||||
instance_id = slot.get("instance_id")
|
||||
if instance_id:
|
||||
try:
|
||||
|
|
@ -276,6 +279,7 @@ class Reconciler:
|
|||
exc_info=True,
|
||||
)
|
||||
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
|
||||
self._mark_termination_started()
|
||||
log.info(
|
||||
"slot_terminating",
|
||||
extra={"slot_id": slot_id, "drain_duration": drain_duration},
|
||||
|
|
@ -324,11 +328,15 @@ class Reconciler:
|
|||
return (now - last_change).total_seconds()
|
||||
|
||||
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
|
||||
if not self._can_start_termination():
|
||||
return
|
||||
slot_id = slot["slot_id"]
|
||||
instance_id = slot.get("instance_id")
|
||||
started_terminating = False
|
||||
if instance_id:
|
||||
self._terminate_instance_best_effort(slot_id, instance_id)
|
||||
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
|
||||
started_terminating = True
|
||||
else:
|
||||
self._db.update_slot_state(
|
||||
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
|
||||
|
|
@ -340,6 +348,23 @@ class Reconciler:
|
|||
if extra:
|
||||
payload.update(extra)
|
||||
log.warning(reason, extra=payload)
|
||||
if started_terminating:
|
||||
self._mark_termination_started()
|
||||
|
||||
def _can_start_termination(self) -> bool:
|
||||
cooldown_seconds = self._config.capacity.termination_cooldown_seconds
|
||||
if cooldown_seconds <= 0:
|
||||
return True
|
||||
if self._termination_cooldown_until is None:
|
||||
return True
|
||||
return self._clock.now() >= self._termination_cooldown_until
|
||||
|
||||
def _mark_termination_started(self) -> None:
|
||||
cooldown_seconds = self._config.capacity.termination_cooldown_seconds
|
||||
if cooldown_seconds <= 0:
|
||||
self._termination_cooldown_until = None
|
||||
return
|
||||
self._termination_cooldown_until = self._clock.now() + timedelta(seconds=cooldown_seconds)
|
||||
|
||||
def _terminate_instance_best_effort(self, slot_id: str, instance_id: str) -> None:
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -71,6 +71,7 @@ def _make_env(
|
|||
boot_timeout=300,
|
||||
binding_timeout=180,
|
||||
terminating_timeout=300,
|
||||
termination_cooldown=0,
|
||||
):
|
||||
clock = FakeClock()
|
||||
db = StateDB(":memory:", clock=clock)
|
||||
|
|
@ -85,6 +86,7 @@ def _make_env(
|
|||
boot_timeout_seconds=boot_timeout,
|
||||
binding_timeout_seconds=binding_timeout,
|
||||
terminating_timeout_seconds=terminating_timeout,
|
||||
termination_cooldown_seconds=termination_cooldown,
|
||||
),
|
||||
aws=AwsConfig(region="us-east-1"),
|
||||
)
|
||||
|
|
@ -195,3 +197,34 @@ def test_terminating_timeout_reissues_terminate_with_pacing() -> None:
|
|||
# Immediate next tick should not retry yet because last_state_change was refreshed.
|
||||
reconciler.tick()
|
||||
assert runtime.terminate_calls == ["i-5"]
|
||||
|
||||
|
||||
def test_termination_cooldown_spaces_terminations() -> None:
    """Starting one termination blocks the next until the cooldown elapses."""
    state_db, fake_runtime, reconciler, fake_clock = _make_env(termination_cooldown=30)

    def drain(instance: str) -> None:
        # Put a running instance into the slot and mark it idle-draining.
        fake_runtime.instances[instance] = _Instance(state="running", slot_id="slot001")
        state_db.update_slot_state(
            "slot001", SlotState.DRAINING, instance_id=instance, lease_count=0
        )

    def slot_state() -> str:
        record = state_db.get_slot("slot001")
        assert record is not None
        return record["state"]

    # First drained slot terminates immediately and arms the cooldown.
    drain("i-6")
    reconciler.tick()
    assert slot_state() == SlotState.TERMINATING.value
    assert fake_runtime.terminate_calls == ["i-6"]

    # A second drain inside the cooldown window must be deferred.
    drain("i-7")
    fake_clock.advance(10)
    reconciler.tick()
    assert slot_state() == SlotState.DRAINING.value
    assert fake_runtime.terminate_calls == ["i-6"]

    # Once the 30s cooldown has fully elapsed, termination resumes.
    fake_clock.advance(21)
    reconciler.tick()
    assert slot_state() == SlotState.TERMINATING.value
    assert fake_runtime.terminate_calls == ["i-6", "i-7"]
|
||||
|
|
|
|||
|
|
@ -118,6 +118,7 @@ def test_effective_config_returns_capacity_and_scheduler() -> None:
|
|||
body = response.json()
|
||||
assert body["capacity"]["max_slots"] == 8
|
||||
assert body["capacity"]["idle_scale_down_seconds"] == 900
|
||||
assert body["capacity"]["termination_cooldown_seconds"] == 180
|
||||
assert body["scheduler"]["tick_seconds"] == 3.0
|
||||
assert body["scheduler"]["reconcile_seconds"] == 15.0
|
||||
|
||||
|
|
|
|||
|
|
@ -205,6 +205,12 @@ in
|
|||
description = "Max seconds between terminate retries while slot is terminating.";
|
||||
};
|
||||
|
||||
terminationCooldownSeconds = lib.mkOption {
|
||||
type = lib.types.int;
|
||||
default = 180;
|
||||
description = "Minimum cooldown in seconds between starting slot terminations.";
|
||||
};
|
||||
|
||||
launchBatchSize = lib.mkOption {
|
||||
type = lib.types.int;
|
||||
default = 1;
|
||||
|
|
@ -329,6 +335,7 @@ in
|
|||
boot_timeout_seconds = ${toString cfg.capacity.bootTimeoutSeconds}
|
||||
binding_timeout_seconds = ${toString cfg.capacity.bindingTimeoutSeconds}
|
||||
terminating_timeout_seconds = ${toString cfg.capacity.terminatingTimeoutSeconds}
|
||||
termination_cooldown_seconds = ${toString cfg.capacity.terminationCooldownSeconds}
|
||||
|
||||
[security]
|
||||
socket_mode = "${cfg.security.socketMode}"
|
||||
|
|
@ -343,6 +350,7 @@ in
|
|||
max_leases_per_slot = ${toString cfg.capacity.maxLeasesPerSlot}
|
||||
launch_batch_size = ${toString cfg.capacity.launchBatchSize}
|
||||
scale_down_idle_seconds = ${toString cfg.capacity.idleScaleDownSeconds}
|
||||
termination_cooldown_seconds = ${toString cfg.capacity.terminationCooldownSeconds}
|
||||
EOF
|
||||
|
||||
chown ${cfg.user}:${cfg.group} ${generatedConfigPath}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue