add slot ttl output with effective timeout config

This commit is contained in:
Abel Luck 2026-02-27 16:43:52 +01:00
parent d8afde8b18
commit 8fdf2d5e5b
5 changed files with 211 additions and 4 deletions

View file

@ -14,12 +14,15 @@ from pydantic import BaseModel
from .models import ( from .models import (
CapacityHint, CapacityHint,
CapacityPolicy,
EffectiveConfigResponse,
ErrorDetail, ErrorDetail,
ErrorResponse, ErrorResponse,
HealthResponse, HealthResponse,
ReservationPhase, ReservationPhase,
ReservationRequest, ReservationRequest,
ReservationResponse, ReservationResponse,
SchedulerPolicy,
SlotInfo, SlotInfo,
SlotState, SlotState,
StateSummary, StateSummary,
@ -180,6 +183,27 @@ def create_app(
summary = db.get_state_summary() summary = db.get_state_summary()
return StateSummary.model_validate(summary) return StateSummary.model_validate(summary)
@app.get("/v1/config/effective", response_model=EffectiveConfigResponse)
def effective_config() -> EffectiveConfigResponse:
    """Report the capacity and scheduler settings held in ``config``.

    Every response field is copied from the same-named attribute of
    ``config.capacity`` or ``config.scheduler``.
    """
    cap = config.capacity
    sched = config.scheduler
    capacity_policy = CapacityPolicy(
        min_slots=cap.min_slots,
        max_slots=cap.max_slots,
        target_warm_slots=cap.target_warm_slots,
        max_leases_per_slot=cap.max_leases_per_slot,
        idle_scale_down_seconds=cap.idle_scale_down_seconds,
        drain_timeout_seconds=cap.drain_timeout_seconds,
        launch_timeout_seconds=cap.launch_timeout_seconds,
        boot_timeout_seconds=cap.boot_timeout_seconds,
        binding_timeout_seconds=cap.binding_timeout_seconds,
        terminating_timeout_seconds=cap.terminating_timeout_seconds,
    )
    scheduler_policy = SchedulerPolicy(
        tick_seconds=sched.tick_seconds,
        reconcile_seconds=sched.reconcile_seconds,
    )
    return EffectiveConfigResponse(capacity=capacity_policy, scheduler=scheduler_policy)
@app.post("/v1/hints/capacity") @app.post("/v1/hints/capacity")
def capacity_hint(hint: CapacityHint) -> dict[str, str]: def capacity_hint(hint: CapacityHint) -> dict[str, str]:
log.info( log.info(

View file

@ -7,6 +7,7 @@ import http.client
import json import json
import socket import socket
from collections.abc import Sequence from collections.abc import Sequence
from datetime import UTC, datetime
from typing import Any from typing import Any
@ -64,7 +65,81 @@ def _print_table(headers: Sequence[str], rows: Sequence[Sequence[str]]) -> None:
print(" ".join(cell.ljust(widths[idx]) for idx, cell in enumerate(row))) print(" ".join(cell.ljust(widths[idx]) for idx, cell in enumerate(row)))
def _format_duration(seconds: float) -> str:
    """Render a duration as a compact ``1h05m`` / ``3m07s`` / ``42s`` string.

    The input is rounded to whole seconds and negative values are shown as
    ``0s``. From one hour upwards the seconds component is not displayed.
    """
    whole_seconds = int(max(0, round(seconds)))
    whole_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    if hours:
        return f"{hours}h{minutes:02d}m"
    return f"{minutes}m{secs:02d}s" if minutes else f"{secs}s"
def _format_timeout_ttl(timeout_seconds: float, age_seconds: float) -> str:
    """Describe how much of a timeout is left for something ``age_seconds`` old.

    Returns ``"due"`` once the age has reached or passed the timeout; otherwise
    the remaining time as formatted by ``_format_duration``.
    """
    time_left = timeout_seconds - age_seconds
    return "due" if time_left <= 0 else _format_duration(time_left)
def _slot_age_seconds(slot: dict[str, Any]) -> float | None:
raw = slot.get("last_state_change")
if not isinstance(raw, str):
return None
try:
dt = datetime.fromisoformat(raw)
except ValueError:
return None
if dt.tzinfo is None:
dt = dt.replace(tzinfo=UTC)
return (datetime.now(UTC) - dt).total_seconds()
def _slot_ttl(slot: dict[str, Any], policy: dict[str, Any] | None, active_slots: int) -> str:
if policy is None:
return "-"
capacity = policy.get("capacity")
scheduler = policy.get("scheduler")
if not isinstance(capacity, dict) or not isinstance(scheduler, dict):
return "-"
state = str(slot.get("state", ""))
lease_count = int(slot.get("lease_count", 0))
age_seconds = _slot_age_seconds(slot)
if state in {"empty", "error"}:
return "-"
if age_seconds is None:
return "?"
if state == "launching":
return _format_timeout_ttl(float(capacity.get("launch_timeout_seconds", 0)), age_seconds)
if state == "booting":
return _format_timeout_ttl(float(capacity.get("boot_timeout_seconds", 0)), age_seconds)
if state == "binding":
return _format_timeout_ttl(float(capacity.get("binding_timeout_seconds", 0)), age_seconds)
if state == "terminating":
return _format_timeout_ttl(
float(capacity.get("terminating_timeout_seconds", 0)), age_seconds
)
if state == "draining":
if lease_count == 0:
return f"<={_format_duration(float(scheduler.get('reconcile_seconds', 0)))}"
return _format_timeout_ttl(float(capacity.get("drain_timeout_seconds", 0)), age_seconds)
if state == "ready":
if lease_count > 0:
return "-"
min_slots = int(capacity.get("min_slots", 0))
if active_slots <= min_slots:
return "pinned"
return _format_timeout_ttl(float(capacity.get("idle_scale_down_seconds", 0)), age_seconds)
return "-"
def _print_slots(data: list[dict[str, Any]], policy: dict[str, Any] | None = None) -> None:
active_slots = sum(1 for slot in data if str(slot.get("state", "")) not in {"empty", "error"})
rows: list[list[str]] = [] rows: list[list[str]] = []
for slot in data: for slot in data:
rows.append( rows.append(
@ -74,9 +149,10 @@ def _print_slots(data: list[dict[str, Any]]) -> None:
str(slot.get("instance_id") or "-"), str(slot.get("instance_id") or "-"),
str(slot.get("instance_ip") or "-"), str(slot.get("instance_ip") or "-"),
str(slot.get("lease_count", 0)), str(slot.get("lease_count", 0)),
_slot_ttl(slot, policy, active_slots),
] ]
) )
_print_table(["slot_id", "state", "instance_id", "ip", "leases"], rows) _print_table(["slot_id", "state", "instance_id", "ip", "leases", "ttl"], rows)
def _print_reservations(data: list[dict[str, Any]]) -> None: def _print_reservations(data: list[dict[str, Any]]) -> None:
@ -117,6 +193,18 @@ def _print_status_summary(data: dict[str, Any]) -> None:
_print_table(["metric", "value"], rows) _print_table(["metric", "value"], rows)
def _get_effective_config(socket_path: str) -> dict[str, Any] | None:
    """Fetch ``/v1/config/effective`` over the socket, or ``None`` if unavailable.

    ``None`` is returned when the request raises ``OSError``, when the status
    is outside the 2xx range, or when the decoded body is not a dict.
    """
    # NOTE(review): only OSError is treated as "unavailable" - confirm that
    # _uds_request cannot raise other exception types (e.g. HTTP or decode errors).
    try:
        status, data = _uds_request(socket_path, "GET", "/v1/config/effective")
    except OSError:
        return None
    succeeded = 200 <= status < 300
    return data if succeeded and isinstance(data, dict) else None
def _bulk_slot_action(socket_path: str, action: str) -> dict[str, Any]: def _bulk_slot_action(socket_path: str, action: str) -> dict[str, Any]:
if action == "drain": if action == "drain":
eligible_states = {"ready"} eligible_states = {"ready"}
@ -307,7 +395,8 @@ def main() -> None:
print(json.dumps(data, indent=2)) print(json.dumps(data, indent=2))
elif args.command == "slots": elif args.command == "slots":
if isinstance(data, list): if isinstance(data, list):
_print_slots(data) policy = _get_effective_config(args.socket)
_print_slots(data, policy)
else: else:
_print_error(data) _print_error(data)
raise SystemExit(1) raise SystemExit(1)

View file

@ -131,6 +131,35 @@ class StateSummary(BaseModel):
haproxy: HaproxySummary = Field(default_factory=HaproxySummary) haproxy: HaproxySummary = Field(default_factory=HaproxySummary)
class CapacityPolicy(BaseModel):
    """Effective capacity timeout and sizing policy."""

    # The GET /v1/config/effective handler fills every field from the
    # same-named attribute of ``config.capacity``.

    # Sizing values. The CLI ttl column reports an idle ``ready`` slot as
    # "pinned" while the active slot count is at or below ``min_slots``.
    min_slots: int
    max_slots: int
    target_warm_slots: int
    max_leases_per_slot: int

    # Timeouts in seconds. The CLI ttl column counts each one down from the
    # slot's last state change, choosing the field by slot state:
    #   ready (no leases)    -> idle_scale_down_seconds
    #   draining (has leases) -> drain_timeout_seconds
    #   launching            -> launch_timeout_seconds
    #   booting              -> boot_timeout_seconds
    #   binding              -> binding_timeout_seconds
    #   terminating          -> terminating_timeout_seconds
    idle_scale_down_seconds: int
    drain_timeout_seconds: int
    launch_timeout_seconds: int
    boot_timeout_seconds: int
    binding_timeout_seconds: int
    terminating_timeout_seconds: int
class SchedulerPolicy(BaseModel):
    """Effective scheduler timing policy."""

    # The GET /v1/config/effective handler fills both fields from the
    # same-named attributes of ``config.scheduler``.
    tick_seconds: float
    # The CLI ttl column shows "<=" plus this interval for a draining slot
    # that holds no leases.
    reconcile_seconds: float
class EffectiveConfigResponse(BaseModel):
    """GET /v1/config/effective response."""

    # Sizing values and per-state timeouts.
    capacity: CapacityPolicy
    # Scheduler timing values (tick and reconcile intervals).
    scheduler: SchedulerPolicy
class ErrorDetail(BaseModel): class ErrorDetail(BaseModel):
"""Structured error detail.""" """Structured error detail."""

View file

@ -2,10 +2,12 @@
from __future__ import annotations from __future__ import annotations
from datetime import UTC, datetime, timedelta
import pytest import pytest
from nix_builder_autoscaler import cli from nix_builder_autoscaler import cli
from nix_builder_autoscaler.cli import _parse_args, _print_status_summary from nix_builder_autoscaler.cli import _parse_args, _print_slots, _print_status_summary, _slot_ttl
def test_parse_args_without_command_prints_help_and_exits_zero( def test_parse_args_without_command_prints_help_and_exits_zero(
@ -95,3 +97,55 @@ def test_bulk_unquarantine_only_targets_error_slots(monkeypatch: pytest.MonkeyPa
assert summary["succeeded"] == 1 assert summary["succeeded"] == 1
assert summary["failed"] == 0 assert summary["failed"] == 0
assert summary["skipped"] == 1 assert summary["skipped"] == 1
def test_slot_ttl_ready_pinned_at_min_slots() -> None:
    """An idle ready slot is reported as pinned when active slots equal ``min_slots``."""
    changed_at = datetime.now(UTC) - timedelta(seconds=60)
    idle_ready_slot = {
        "state": "ready",
        "lease_count": 0,
        "last_state_change": changed_at.isoformat(),
    }
    capacity = {
        "min_slots": 1,
        "idle_scale_down_seconds": 900,
        "launch_timeout_seconds": 300,
        "boot_timeout_seconds": 300,
        "binding_timeout_seconds": 180,
        "drain_timeout_seconds": 120,
        "terminating_timeout_seconds": 300,
    }
    policy = {"capacity": capacity, "scheduler": {"reconcile_seconds": 15.0}}

    ttl = _slot_ttl(idle_ready_slot, policy, active_slots=1)

    assert ttl == "pinned"
def test_print_slots_includes_ttl_column(capsys: pytest.CaptureFixture[str]) -> None:
    """The slots table printed with a policy contains the ttl header and the slot row."""
    launched_at = datetime.now(UTC) - timedelta(seconds=20)
    launching_slot = {
        "slot_id": "slot001",
        "state": "launching",
        "instance_id": "i-123",
        "instance_ip": None,
        "lease_count": 0,
        "last_state_change": launched_at.isoformat(),
    }
    capacity = {
        "min_slots": 0,
        "idle_scale_down_seconds": 900,
        "launch_timeout_seconds": 300,
        "boot_timeout_seconds": 300,
        "binding_timeout_seconds": 180,
        "drain_timeout_seconds": 120,
        "terminating_timeout_seconds": 300,
    }
    policy = {"capacity": capacity, "scheduler": {"reconcile_seconds": 15.0}}

    _print_slots([launching_slot], policy)

    printed = capsys.readouterr().out
    assert "ttl" in printed
    assert "slot001" in printed

View file

@ -111,6 +111,17 @@ def test_state_summary_returns_counts() -> None:
assert body["slots"]["empty"] == 3 assert body["slots"]["empty"] == 3
def test_effective_config_returns_capacity_and_scheduler() -> None:
    """GET /v1/config/effective answers 200 with the expected capacity and scheduler values."""
    client, _, _, _ = _make_client()
    response = client.get("/v1/config/effective")
    assert response.status_code == 200

    body = response.json()
    expected = {
        "capacity": {"max_slots": 8, "idle_scale_down_seconds": 900},
        "scheduler": {"tick_seconds": 3.0, "reconcile_seconds": 15.0},
    }
    for section, fields in expected.items():
        for field, value in fields.items():
            assert body[section][field] == value
def test_health_live_returns_ok() -> None: def test_health_live_returns_ok() -> None:
client, _, _, _ = _make_client() client, _, _, _ = _make_client()
response = client.get("/health/live") response = client.get("/health/live")