From 44bc99ab85a2501e1add8ca8265377a08f9c1f72 Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Fri, 27 Feb 2026 18:37:58 +0100 Subject: [PATCH] add termination cooldown for slot scale-down --- agent/nix_builder_autoscaler/api.py | 1 + agent/nix_builder_autoscaler/config.py | 2 ++ agent/nix_builder_autoscaler/models.py | 1 + agent/nix_builder_autoscaler/reconciler.py | 27 ++++++++++++++- .../tests/test_reconciler.py | 33 +++++++++++++++++++ .../tests/test_reservations_api.py | 1 + .../nixos/services/nix-builder-autoscaler.nix | 8 +++++ 7 files changed, 72 insertions(+), 1 deletion(-) diff --git a/agent/nix_builder_autoscaler/api.py b/agent/nix_builder_autoscaler/api.py index 55a92e0..5f3fc9e 100644 --- a/agent/nix_builder_autoscaler/api.py +++ b/agent/nix_builder_autoscaler/api.py @@ -197,6 +197,7 @@ def create_app( boot_timeout_seconds=config.capacity.boot_timeout_seconds, binding_timeout_seconds=config.capacity.binding_timeout_seconds, terminating_timeout_seconds=config.capacity.terminating_timeout_seconds, + termination_cooldown_seconds=config.capacity.termination_cooldown_seconds, ), scheduler=SchedulerPolicy( tick_seconds=config.scheduler.tick_seconds, diff --git a/agent/nix_builder_autoscaler/config.py b/agent/nix_builder_autoscaler/config.py index b465d7a..2cc2a72 100644 --- a/agent/nix_builder_autoscaler/config.py +++ b/agent/nix_builder_autoscaler/config.py @@ -50,6 +50,7 @@ class SystemConfig: max_leases_per_slot: int = 1 launch_batch_size: int = 1 scale_down_idle_seconds: int = 900 + termination_cooldown_seconds: int = 180 @dataclass @@ -68,6 +69,7 @@ class CapacityConfig: boot_timeout_seconds: int = 300 binding_timeout_seconds: int = 180 terminating_timeout_seconds: int = 300 + termination_cooldown_seconds: int = 180 @dataclass diff --git a/agent/nix_builder_autoscaler/models.py b/agent/nix_builder_autoscaler/models.py index e8bd3e6..9d186f6 100644 --- a/agent/nix_builder_autoscaler/models.py +++ b/agent/nix_builder_autoscaler/models.py @@ -144,6 +144,7 @@ class CapacityPolicy(BaseModel): boot_timeout_seconds: int binding_timeout_seconds: int terminating_timeout_seconds: int + termination_cooldown_seconds: int class SchedulerPolicy(BaseModel): diff --git a/agent/nix_builder_autoscaler/reconciler.py b/agent/nix_builder_autoscaler/reconciler.py index 676c1b6..d632f24 100644 --- a/agent/nix_builder_autoscaler/reconciler.py +++ b/agent/nix_builder_autoscaler/reconciler.py @@ -10,7 +10,7 @@ from __future__ import annotations import contextlib import logging import time -from datetime import datetime +from datetime import datetime, timedelta from typing import TYPE_CHECKING from .models import SlotState @@ -50,6 +50,7 @@ class Reconciler: self._clock = clock self._metrics = metrics self._binding_up_counts: dict[str, int] = {} + self._termination_cooldown_until: datetime | None = None def tick(self) -> None: """Execute one reconciliation tick.""" @@ -255,6 +256,8 @@ class Reconciler: drain_timeout = self._config.capacity.drain_timeout_seconds if slot["lease_count"] == 0 or drain_duration >= drain_timeout: + if not self._can_start_termination(): + return instance_id = slot.get("instance_id") if instance_id: try: @@ -276,6 +279,7 @@ class Reconciler: exc_info=True, ) self._db.update_slot_state(slot_id, SlotState.TERMINATING) + self._mark_termination_started() log.info( "slot_terminating", extra={"slot_id": slot_id, "drain_duration": drain_duration}, @@ -324,11 +328,15 @@ class Reconciler: return (now - last_change).total_seconds() def _begin_termination(self, slot: dict, reason: str, extra: dict | None = None) -> None: + if not self._can_start_termination(): + return slot_id = slot["slot_id"] instance_id = slot.get("instance_id") + started_terminating = False if instance_id: self._terminate_instance_best_effort(slot_id, instance_id) self._db.update_slot_state(slot_id, SlotState.TERMINATING) + started_terminating = True else: self._db.update_slot_state( slot_id, SlotState.EMPTY, instance_id=None, instance_ip=None, lease_count=0 @@ -340,6 +348,23 @@ class Reconciler: if extra: payload.update(extra) log.warning(reason, extra=payload) + if started_terminating: + self._mark_termination_started() + + def _can_start_termination(self) -> bool: + cooldown_seconds = self._config.capacity.termination_cooldown_seconds + if cooldown_seconds <= 0: + return True + if self._termination_cooldown_until is None: + return True + return self._clock.now() >= self._termination_cooldown_until + + def _mark_termination_started(self) -> None: + cooldown_seconds = self._config.capacity.termination_cooldown_seconds + if cooldown_seconds <= 0: + self._termination_cooldown_until = None + return + self._termination_cooldown_until = self._clock.now() + timedelta(seconds=cooldown_seconds) def _terminate_instance_best_effort(self, slot_id: str, instance_id: str) -> None: try: diff --git a/agent/nix_builder_autoscaler/tests/test_reconciler.py b/agent/nix_builder_autoscaler/tests/test_reconciler.py index 34af294..257dea7 100644 --- a/agent/nix_builder_autoscaler/tests/test_reconciler.py +++ b/agent/nix_builder_autoscaler/tests/test_reconciler.py @@ -71,6 +71,7 @@ def _make_env( boot_timeout=300, binding_timeout=180, terminating_timeout=300, + termination_cooldown=0, ): clock = FakeClock() db = StateDB(":memory:", clock=clock) @@ -85,6 +86,7 @@ def _make_env( boot_timeout_seconds=boot_timeout, binding_timeout_seconds=binding_timeout, terminating_timeout_seconds=terminating_timeout, + termination_cooldown_seconds=termination_cooldown, ), aws=AwsConfig(region="us-east-1"), ) @@ -195,3 +197,34 @@ def test_terminating_timeout_reissues_terminate_with_pacing() -> None: # Immediate next tick should not retry yet because last_state_change was refreshed. reconciler.tick() assert runtime.terminate_calls == ["i-5"] + + +def test_termination_cooldown_spaces_terminations() -> None: + db, runtime, reconciler, clock = _make_env(termination_cooldown=30) + runtime.instances["i-6"] = _Instance(state="running", slot_id="slot001") + db.update_slot_state("slot001", SlotState.DRAINING, instance_id="i-6", lease_count=0) + + reconciler.tick() + slot = db.get_slot("slot001") + assert slot is not None + assert slot["state"] == SlotState.TERMINATING.value + assert runtime.terminate_calls == ["i-6"] + + # New draining cycle before cooldown expires should not terminate yet. + runtime.instances["i-7"] = _Instance(state="running", slot_id="slot001") + db.update_slot_state("slot001", SlotState.DRAINING, instance_id="i-7", lease_count=0) + clock.advance(10) + reconciler.tick() + + slot = db.get_slot("slot001") + assert slot is not None + assert slot["state"] == SlotState.DRAINING.value + assert runtime.terminate_calls == ["i-6"] + + # After cooldown, termination proceeds. + clock.advance(21) + reconciler.tick() + slot = db.get_slot("slot001") + assert slot is not None + assert slot["state"] == SlotState.TERMINATING.value + assert runtime.terminate_calls == ["i-6", "i-7"] diff --git a/agent/nix_builder_autoscaler/tests/test_reservations_api.py b/agent/nix_builder_autoscaler/tests/test_reservations_api.py index b4af2a2..14c807e 100644 --- a/agent/nix_builder_autoscaler/tests/test_reservations_api.py +++ b/agent/nix_builder_autoscaler/tests/test_reservations_api.py @@ -118,6 +118,7 @@ def test_effective_config_returns_capacity_and_scheduler() -> None: body = response.json() assert body["capacity"]["max_slots"] == 8 assert body["capacity"]["idle_scale_down_seconds"] == 900 + assert body["capacity"]["termination_cooldown_seconds"] == 180 assert body["scheduler"]["tick_seconds"] == 3.0 assert body["scheduler"]["reconcile_seconds"] == 15.0 diff --git a/nix/modules/nixos/services/nix-builder-autoscaler.nix b/nix/modules/nixos/services/nix-builder-autoscaler.nix index 7697ac5..1ad4368 100644 --- a/nix/modules/nixos/services/nix-builder-autoscaler.nix +++ b/nix/modules/nixos/services/nix-builder-autoscaler.nix @@ -205,6 +205,12 @@ in description = "Max seconds between terminate retries while slot is terminating."; }; + terminationCooldownSeconds = lib.mkOption { + type = lib.types.int; + default = 180; + description = "Minimum cooldown in seconds between starting slot terminations."; + }; + launchBatchSize = lib.mkOption { type = lib.types.int; default = 1; @@ -329,6 +335,7 @@ in boot_timeout_seconds = ${toString cfg.capacity.bootTimeoutSeconds} binding_timeout_seconds = ${toString cfg.capacity.bindingTimeoutSeconds} terminating_timeout_seconds = ${toString cfg.capacity.terminatingTimeoutSeconds} + termination_cooldown_seconds = ${toString cfg.capacity.terminationCooldownSeconds} [security] socket_mode = "${cfg.security.socketMode}" @@ -343,6 +350,7 @@ in max_leases_per_slot = ${toString cfg.capacity.maxLeasesPerSlot} launch_batch_size = ${toString cfg.capacity.launchBatchSize} scale_down_idle_seconds = ${toString cfg.capacity.idleScaleDownSeconds} + termination_cooldown_seconds = ${toString cfg.capacity.terminationCooldownSeconds} EOF chown ${cfg.user}:${cfg.group} ${generatedConfigPath}