agent: complete plan05 closeout

This commit is contained in:
Abel Luck 2026-02-27 13:48:52 +01:00
parent 33ba248c49
commit 2f0fffa905
12 changed files with 1347 additions and 313 deletions

View file

@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, NoReturn
from fastapi import FastAPI, HTTPException, Request, Response
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from .models import (
CapacityHint,
@ -35,6 +36,12 @@ if TYPE_CHECKING:
# Module-level logger named after this module; handlers below use it to record
# failures (e.g. ``log.exception`` in the reconcile-now admin endpoint).
log = logging.getLogger(__name__)
# Request body model shared by the slot-targeting admin endpoints in this file
# (drain and unquarantine), which read ``body.slot_id`` and look the slot up
# through ``db.get_slot``.
class SlotAdminRequest(BaseModel):
"""Admin action request that targets a slot."""
# Identifier of the slot the admin action applies to.
slot_id: str
def _parse_required_dt(value: str) -> datetime:
return datetime.fromisoformat(value)
@ -95,6 +102,8 @@ def create_app(
haproxy: HAProxyRuntime | None = None,
scheduler_running: Callable[[], bool] | None = None,
reconciler_running: Callable[[], bool] | None = None,
ready_check: Callable[[], bool] | None = None,
reconcile_now: Callable[[], dict[str, object] | None] | None = None,
) -> FastAPI:
"""Create the FastAPI application."""
app = FastAPI(title="nix-builder-autoscaler", version="0.1.0")
@ -191,6 +200,11 @@ def create_app(
@app.get("/health/ready", response_model=HealthResponse)
def health_ready() -> HealthResponse:
if ready_check is not None and not ready_check():
return JSONResponse( # type: ignore[return-value]
status_code=503,
content=HealthResponse(status="degraded").model_dump(mode="json"),
)
if scheduler_running is not None and not scheduler_running():
return JSONResponse( # type: ignore[return-value]
status_code=503,
@ -207,4 +221,83 @@ def create_app(
def metrics_endpoint() -> Response:
    # Render the current metrics and return them as a plain-text response.
    rendered = metrics.render()
    return Response(content=rendered, media_type="text/plain")
@app.post("/v1/admin/drain")
def admin_drain(body: SlotAdminRequest, request: Request) -> dict[str, str]:
    # Admin action: move the requested slot into DRAINING.
    #   - unknown slot                      -> 404 "not_found"
    #   - already DRAINING / TERMINATING    -> accepted, nothing is written
    #   - READY / BINDING / BOOTING / LAUNCHING -> state updated to DRAINING
    #   - any other state                   -> 409 "invalid_state"
    slot_id = body.slot_id
    record = db.get_slot(slot_id)
    if record is None:
        _error_response(request, 404, "not_found", "Slot not found")

    current = str(record["state"])

    # The slot is already on its way out: report its current state as-is.
    if current in (SlotState.DRAINING.value, SlotState.TERMINATING.value):
        return {"status": "accepted", "slot_id": slot_id, "state": current}

    # States from which a drain may be started.
    drainable = (
        SlotState.READY.value,
        SlotState.BINDING.value,
        SlotState.BOOTING.value,
        SlotState.LAUNCHING.value,
    )
    if current not in drainable:
        _error_response(
            request,
            409,
            "invalid_state",
            f"Cannot drain slot from state {current}",
        )

    # The drain transition also resets the interruption_pending field to 0.
    db.update_slot_state(slot_id, SlotState.DRAINING, interruption_pending=0)
    return {
        "status": "accepted",
        "slot_id": slot_id,
        "state": SlotState.DRAINING.value,
    }
@app.post("/v1/admin/unquarantine")
def admin_unquarantine(body: SlotAdminRequest, request: Request) -> dict[str, str]:
    # Admin action: return a slot that is in ERROR back to EMPTY.
    #   - unknown slot          -> 404 "not_found"
    #   - state other than ERROR -> 409 "invalid_state"
    #   - ERROR                 -> reset to EMPTY with instance fields cleared
    slot_id = body.slot_id
    record = db.get_slot(slot_id)
    if record is None:
        _error_response(request, 404, "not_found", "Slot not found")

    current = str(record["state"])
    in_error = current == SlotState.ERROR.value
    if not in_error:
        _error_response(
            request,
            409,
            "invalid_state",
            f"Cannot unquarantine slot from state {current}",
        )

    # Reset the slot: clear the instance fields, lease count, cooldown and
    # interruption flag in the same update that sets the EMPTY state.
    db.update_slot_state(
        slot_id,
        SlotState.EMPTY,
        instance_id=None,
        instance_ip=None,
        instance_launch_time=None,
        lease_count=0,
        cooldown_until=None,
        interruption_pending=0,
    )
    response = {
        "status": "accepted",
        "slot_id": slot_id,
        "state": SlotState.EMPTY.value,
    }
    return response
@app.post("/v1/admin/reconcile-now")
def admin_reconcile_now(request: Request) -> dict[str, object]:
    # Admin action: invoke the injected ``reconcile_now`` callable once.
    #   - no callable configured -> 503 "not_configured" (retryable)
    #   - callable raises        -> logged, then 500 "reconcile_failed" (retryable)
    #   - otherwise              -> {"status": "accepted"} merged with the
    #                               callable's result when that result is a dict
    trigger = reconcile_now
    if trigger is None:
        _error_response(
            request,
            503,
            "not_configured",
            "Reconcile trigger not configured",
            retryable=True,
        )

    try:
        outcome = trigger()
    except Exception:
        # Keep the traceback in the log; the client only sees a generic error.
        log.exception("admin_reconcile_now_failed")
        _error_response(
            request,
            500,
            "reconcile_failed",
            "Reconcile tick failed",
            retryable=True,
        )

    if not isinstance(outcome, dict):
        return {"status": "accepted"}
    # Keys from the reconcile result take precedence over the default payload.
    return {"status": "accepted", **outcome}
return app