add termination cooldown for slot scale-down
This commit is contained in:
parent
e1dbd5c119
commit
44bc99ab85
7 changed files with 72 additions and 1 deletions
|
|
@ -197,6 +197,7 @@ def create_app(
|
||||||
boot_timeout_seconds=config.capacity.boot_timeout_seconds,
|
boot_timeout_seconds=config.capacity.boot_timeout_seconds,
|
||||||
binding_timeout_seconds=config.capacity.binding_timeout_seconds,
|
binding_timeout_seconds=config.capacity.binding_timeout_seconds,
|
||||||
terminating_timeout_seconds=config.capacity.terminating_timeout_seconds,
|
terminating_timeout_seconds=config.capacity.terminating_timeout_seconds,
|
||||||
|
termination_cooldown_seconds=config.capacity.termination_cooldown_seconds,
|
||||||
),
|
),
|
||||||
scheduler=SchedulerPolicy(
|
scheduler=SchedulerPolicy(
|
||||||
tick_seconds=config.scheduler.tick_seconds,
|
tick_seconds=config.scheduler.tick_seconds,
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,7 @@ class SystemConfig:
|
||||||
max_leases_per_slot: int = 1
|
max_leases_per_slot: int = 1
|
||||||
launch_batch_size: int = 1
|
launch_batch_size: int = 1
|
||||||
scale_down_idle_seconds: int = 900
|
scale_down_idle_seconds: int = 900
|
||||||
|
termination_cooldown_seconds: int = 180
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
@ -68,6 +69,7 @@ class CapacityConfig:
|
||||||
boot_timeout_seconds: int = 300
|
boot_timeout_seconds: int = 300
|
||||||
binding_timeout_seconds: int = 180
|
binding_timeout_seconds: int = 180
|
||||||
terminating_timeout_seconds: int = 300
|
terminating_timeout_seconds: int = 300
|
||||||
|
termination_cooldown_seconds: int = 180
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
|
||||||
|
|
@ -144,6 +144,7 @@ class CapacityPolicy(BaseModel):
|
||||||
boot_timeout_seconds: int
|
boot_timeout_seconds: int
|
||||||
binding_timeout_seconds: int
|
binding_timeout_seconds: int
|
||||||
terminating_timeout_seconds: int
|
terminating_timeout_seconds: int
|
||||||
|
termination_cooldown_seconds: int
|
||||||
|
|
||||||
|
|
||||||
class SchedulerPolicy(BaseModel):
|
class SchedulerPolicy(BaseModel):
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ from __future__ import annotations
|
||||||
import contextlib
|
import contextlib
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime, timedelta
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
from .models import SlotState
|
from .models import SlotState
|
||||||
|
|
@ -50,6 +50,7 @@ class Reconciler:
|
||||||
self._clock = clock
|
self._clock = clock
|
||||||
self._metrics = metrics
|
self._metrics = metrics
|
||||||
self._binding_up_counts: dict[str, int] = {}
|
self._binding_up_counts: dict[str, int] = {}
|
||||||
|
self._termination_cooldown_until: datetime | None = None
|
||||||
|
|
||||||
def tick(self) -> None:
|
def tick(self) -> None:
|
||||||
"""Execute one reconciliation tick."""
|
"""Execute one reconciliation tick."""
|
||||||
|
|
@ -255,6 +256,8 @@ class Reconciler:
|
||||||
|
|
||||||
drain_timeout = self._config.capacity.drain_timeout_seconds
|
drain_timeout = self._config.capacity.drain_timeout_seconds
|
||||||
if slot["lease_count"] == 0 or drain_duration >= drain_timeout:
|
if slot["lease_count"] == 0 or drain_duration >= drain_timeout:
|
||||||
|
if not self._can_start_termination():
|
||||||
|
return
|
||||||
instance_id = slot.get("instance_id")
|
instance_id = slot.get("instance_id")
|
||||||
if instance_id:
|
if instance_id:
|
||||||
try:
|
try:
|
||||||
|
|
@ -276,6 +279,7 @@ class Reconciler:
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
)
|
)
|
||||||
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
|
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
|
||||||
|
self._mark_termination_started()
|
||||||
log.info(
|
log.info(
|
||||||
"slot_terminating",
|
"slot_terminating",
|
||||||
extra={"slot_id": slot_id, "drain_duration": drain_duration},
|
extra={"slot_id": slot_id, "drain_duration": drain_duration},
|
||||||
|
|
@ -324,11 +328,15 @@ class Reconciler:
|
||||||
return (now - last_change).total_seconds()
|
return (now - last_change).total_seconds()
|
||||||
|
|
||||||
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
|
def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None:
|
||||||
|
if not self._can_start_termination():
|
||||||
|
return
|
||||||
slot_id = slot["slot_id"]
|
slot_id = slot["slot_id"]
|
||||||
instance_id = slot.get("instance_id")
|
instance_id = slot.get("instance_id")
|
||||||
|
started_terminating = False
|
||||||
if instance_id:
|
if instance_id:
|
||||||
self._terminate_instance_best_effort(slot_id, instance_id)
|
self._terminate_instance_best_effort(slot_id, instance_id)
|
||||||
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
|
self._db.update_slot_state(slot_id, SlotState.TERMINATING)
|
||||||
|
started_terminating = True
|
||||||
else:
|
else:
|
||||||
self._db.update_slot_state(
|
self._db.update_slot_state(
|
||||||
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
|
slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0
|
||||||
|
|
@ -340,6 +348,23 @@ class Reconciler:
|
||||||
if extra:
|
if extra:
|
||||||
payload.update(extra)
|
payload.update(extra)
|
||||||
log.warning(reason, extra=payload)
|
log.warning(reason, extra=payload)
|
||||||
|
if started_terminating:
|
||||||
|
self._mark_termination_started()
|
||||||
|
|
||||||
|
def _can_start_termination(self) -> bool:
    """Report whether the termination cooldown permits starting a termination now.

    Returns:
        True when the configured ``termination_cooldown_seconds`` is zero or
        negative, when no cooldown deadline has been recorded yet, or when
        the clock has reached the recorded deadline; False while the
        deadline is still in the future.
    """
    if self._config.capacity.termination_cooldown_seconds > 0:
        deadline = self._termination_cooldown_until
        if deadline is not None:
            # A deadline is pending: only proceed once the clock has reached it.
            return self._clock.now() >= deadline
    # Cooldown disabled, or nothing has been terminated yet.
    return True
|
||||||
|
|
||||||
|
def _mark_termination_started(self) -> None:
    """Record that a termination has just begun and arm the cooldown deadline.

    With a positive ``termination_cooldown_seconds`` the deadline becomes the
    current clock time plus that many seconds. With a zero or negative value
    any stored deadline is cleared instead.
    """
    window = self._config.capacity.termination_cooldown_seconds
    if window > 0:
        self._termination_cooldown_until = self._clock.now() + timedelta(seconds=window)
    else:
        # Cooldown disabled: make sure no stale deadline lingers.
        self._termination_cooldown_until = None
|
||||||
|
|
||||||
def _terminate_instance_best_effort(self, slot_id: str, instance_id: str) -> None:
|
def _terminate_instance_best_effort(self, slot_id: str, instance_id: str) -> None:
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -71,6 +71,7 @@ def _make_env(
|
||||||
boot_timeout=300,
|
boot_timeout=300,
|
||||||
binding_timeout=180,
|
binding_timeout=180,
|
||||||
terminating_timeout=300,
|
terminating_timeout=300,
|
||||||
|
termination_cooldown=0,
|
||||||
):
|
):
|
||||||
clock = FakeClock()
|
clock = FakeClock()
|
||||||
db = StateDB(":memory:", clock=clock)
|
db = StateDB(":memory:", clock=clock)
|
||||||
|
|
@ -85,6 +86,7 @@ def _make_env(
|
||||||
boot_timeout_seconds=boot_timeout,
|
boot_timeout_seconds=boot_timeout,
|
||||||
binding_timeout_seconds=binding_timeout,
|
binding_timeout_seconds=binding_timeout,
|
||||||
terminating_timeout_seconds=terminating_timeout,
|
terminating_timeout_seconds=terminating_timeout,
|
||||||
|
termination_cooldown_seconds=termination_cooldown,
|
||||||
),
|
),
|
||||||
aws=AwsConfig(region="us-east-1"),
|
aws=AwsConfig(region="us-east-1"),
|
||||||
)
|
)
|
||||||
|
|
@ -195,3 +197,34 @@ def test_terminating_timeout_reissues_terminate_with_pacing() -> None:
|
||||||
# Immediate next tick should not retry yet because last_state_change was refreshed.
|
# Immediate next tick should not retry yet because last_state_change was refreshed.
|
||||||
reconciler.tick()
|
reconciler.tick()
|
||||||
assert runtime.terminate_calls == ["i-5"]
|
assert runtime.terminate_calls == ["i-5"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_termination_cooldown_spaces_terminations() -> None:
    """A second termination is held back until the 30-second cooldown has elapsed."""
    db, runtime, reconciler, clock = _make_env(termination_cooldown=30)

    def expect(expected_state: SlotState, expected_calls: list[str]) -> None:
        # Check the slot's persisted state and the terminate calls issued so far.
        slot = db.get_slot("slot001")
        assert slot is not None
        assert slot["state"] == expected_state.value
        assert runtime.terminate_calls == expected_calls

    runtime.instances["i-6"] = _Instance(state="running", slot_id="slot001")
    db.update_slot_state("slot001", SlotState.DRAINING, instance_id="i-6", lease_count=0)

    reconciler.tick()
    expect(SlotState.TERMINATING, ["i-6"])

    # New draining cycle before cooldown expires should not terminate yet.
    runtime.instances["i-7"] = _Instance(state="running", slot_id="slot001")
    db.update_slot_state("slot001", SlotState.DRAINING, instance_id="i-7", lease_count=0)
    clock.advance(10)
    reconciler.tick()
    expect(SlotState.DRAINING, ["i-6"])

    # After cooldown, termination proceeds.
    clock.advance(21)
    reconciler.tick()
    expect(SlotState.TERMINATING, ["i-6", "i-7"])
|
||||||
|
|
|
||||||
|
|
@ -118,6 +118,7 @@ def test_effective_config_returns_capacity_and_scheduler() -> None:
|
||||||
body = response.json()
|
body = response.json()
|
||||||
assert body["capacity"]["max_slots"] == 8
|
assert body["capacity"]["max_slots"] == 8
|
||||||
assert body["capacity"]["idle_scale_down_seconds"] == 900
|
assert body["capacity"]["idle_scale_down_seconds"] == 900
|
||||||
|
assert body["capacity"]["termination_cooldown_seconds"] == 180
|
||||||
assert body["scheduler"]["tick_seconds"] == 3.0
|
assert body["scheduler"]["tick_seconds"] == 3.0
|
||||||
assert body["scheduler"]["reconcile_seconds"] == 15.0
|
assert body["scheduler"]["reconcile_seconds"] == 15.0
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -205,6 +205,12 @@ in
|
||||||
description = "Max seconds between terminate retries while slot is terminating.";
|
description = "Max seconds between terminate retries while slot is terminating.";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Written to the generated config as `termination_cooldown_seconds`.
terminationCooldownSeconds = lib.mkOption {
  type = lib.types.int;
  default = 180;
  description = "Minimum cooldown in seconds between starting slot terminations. A value of 0 or less disables the cooldown.";
};
|
||||||
|
|
||||||
launchBatchSize = lib.mkOption {
|
launchBatchSize = lib.mkOption {
|
||||||
type = lib.types.int;
|
type = lib.types.int;
|
||||||
default = 1;
|
default = 1;
|
||||||
|
|
@ -329,6 +335,7 @@ in
|
||||||
boot_timeout_seconds = ${toString cfg.capacity.bootTimeoutSeconds}
|
boot_timeout_seconds = ${toString cfg.capacity.bootTimeoutSeconds}
|
||||||
binding_timeout_seconds = ${toString cfg.capacity.bindingTimeoutSeconds}
|
binding_timeout_seconds = ${toString cfg.capacity.bindingTimeoutSeconds}
|
||||||
terminating_timeout_seconds = ${toString cfg.capacity.terminatingTimeoutSeconds}
|
terminating_timeout_seconds = ${toString cfg.capacity.terminatingTimeoutSeconds}
|
||||||
|
termination_cooldown_seconds = ${toString cfg.capacity.terminationCooldownSeconds}
|
||||||
|
|
||||||
[security]
|
[security]
|
||||||
socket_mode = "${cfg.security.socketMode}"
|
socket_mode = "${cfg.security.socketMode}"
|
||||||
|
|
@ -343,6 +350,7 @@ in
|
||||||
max_leases_per_slot = ${toString cfg.capacity.maxLeasesPerSlot}
|
max_leases_per_slot = ${toString cfg.capacity.maxLeasesPerSlot}
|
||||||
launch_batch_size = ${toString cfg.capacity.launchBatchSize}
|
launch_batch_size = ${toString cfg.capacity.launchBatchSize}
|
||||||
scale_down_idle_seconds = ${toString cfg.capacity.idleScaleDownSeconds}
|
scale_down_idle_seconds = ${toString cfg.capacity.idleScaleDownSeconds}
|
||||||
|
termination_cooldown_seconds = ${toString cfg.capacity.terminationCooldownSeconds}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
chown ${cfg.user}:${cfg.group} ${generatedConfigPath}
|
chown ${cfg.user}:${cfg.group} ${generatedConfigPath}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue