improve autoscalerctl help and add bulk slot actions

accept tailscale ssh banner in haproxy health checks
2026-02-27 16:25:54 +01:00 · 2026-02-27 16:25:54 +01:00
4 changed files with 250 additions and 11 deletions
--- a/agent/nix_builder_autoscaler/cli.py
+++ b/agent/nix_builder_autoscaler/cli.py
@ -94,13 +94,131 @@ def _print_reservations(data: list[dict[str, Any]]) -> None:
    _print_table(["reservation_id", "phase", "system", "slot", "instance_id"], rows)
-def _parse_args() -> argparse.Namespace:
+def _print_status_summary(data: dict[str, Any]) -> None:
    slots = data.get("slots", {})
    reservations = data.get("reservations", {})
    ec2 = data.get("ec2", {})
    haproxy = data.get("haproxy", {})
    rows = [
        ["slots.total", str(slots.get("total", 0))],
        ["slots.ready", str(slots.get("ready", 0))],
        ["slots.launching", str(slots.get("launching", 0))],
        ["slots.booting", str(slots.get("booting", 0))],
        ["slots.binding", str(slots.get("binding", 0))],
        ["slots.terminating", str(slots.get("terminating", 0))],
        ["slots.empty", str(slots.get("empty", 0))],
        ["slots.error", str(slots.get("error", 0))],
        ["reservations.pending", str(reservations.get("pending", 0))],
        ["reservations.ready", str(reservations.get("ready", 0))],
        ["reservations.failed", str(reservations.get("failed", 0))],
        ["ec2.api_ok", str(ec2.get("api_ok", False))],
        ["haproxy.socket_ok", str(haproxy.get("socket_ok", False))],
    ]
    _print_table(["metric", "value"], rows)
 def _bulk_slot_action(socket_path: str, action: str) -> dict[str, Any]:
    """Apply an admin action to every slot whose state makes it eligible.

    Lists slots with ``GET /v1/slots`` and then issues one ``POST`` per
    eligible slot: ``drain`` targets slots in state ``ready`` and
    ``unquarantine`` targets slots in state ``error``.

    Args:
        socket_path: Path handed to ``_uds_request`` for every call.
        action: Either ``"drain"`` or ``"unquarantine"``.

    Returns:
        A summary dict with the ``action``, the counters ``matched``,
        ``attempted``, ``succeeded``, ``failed`` and ``skipped``, and a
        ``results`` list holding one entry per slot that was considered.

    Raises:
        ValueError: If ``action`` is not a known bulk action.
        RuntimeError: If the slot listing returns a non-2xx status or a
            payload that is not a list.
    """
    if action == "drain":
        eligible_states = {"ready"}
        action_path = "/v1/admin/drain"
    elif action == "unquarantine":
        eligible_states = {"error"}
        action_path = "/v1/admin/unquarantine"
    else:
        msg = f"unknown bulk action: {action}"
        raise ValueError(msg)

    # Errors raised by the listing request itself are not caught here; only
    # the per-slot requests below are turned into "failed" results.
    status, data = _uds_request(socket_path, "GET", "/v1/slots")
    if status < 200 or status >= 300 or not isinstance(data, list):
        msg = "failed to list slots for bulk action"
        raise RuntimeError(msg)

    results: list[dict[str, Any]] = []
    summary: dict[str, Any] = {
        "action": action,
        "matched": 0,
        "attempted": 0,
        "succeeded": 0,
        "failed": 0,
        "skipped": 0,
        "results": results,
    }

    for slot in data:
        # An entry that is not an object cannot name a slot; ignore it the
        # same way as an entry without a slot id instead of crashing on
        # ``.get`` with an AttributeError.
        if not isinstance(slot, dict):
            continue
        slot_id = str(slot.get("slot_id", ""))
        state = str(slot.get("state", ""))
        if not slot_id:
            continue

        if state not in eligible_states:
            summary["skipped"] += 1
            results.append(
                {
                    "slot_id": slot_id,
                    "state": state,
                    "result": "skipped",
                    "reason": "ineligible_state",
                }
            )
            continue

        summary["matched"] += 1
        summary["attempted"] += 1
        try:
            action_status, action_data = _uds_request(
                socket_path,
                "POST",
                action_path,
                body={"slot_id": slot_id},
            )
        except OSError as err:
            # Keep going so one unreachable request does not abort the batch.
            summary["failed"] += 1
            results.append(
                {
                    "slot_id": slot_id,
                    "state": state,
                    "result": "failed",
                    "error": str(err),
                }
            )
            continue

        if 200 <= action_status < 300:
            summary["succeeded"] += 1
            results.append(
                {
                    "slot_id": slot_id,
                    "state": state,
                    "result": "ok",
                }
            )
        else:
            summary["failed"] += 1
            results.append(
                {
                    "slot_id": slot_id,
                    "state": state,
                    "result": "failed",
                    "status": action_status,
                    "response": action_data,
                }
            )

    return summary
 def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(prog="autoscalerctl", description="Autoscaler CLI")
    parser.add_argument(
        "--socket",
        default="/run/nix-builder-autoscaler/daemon.sock",
        help="Daemon Unix socket path",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output JSON for status command.",
    )
    subparsers = parser.add_subparsers(dest="command")
    subparsers.add_parser("status", help="Show state summary")
    subparsers.add_parser("slots", help="List slots")
@ -110,8 +228,14 @@ def _parse_args() -> argparse.Namespace:
    parser_drain.add_argument("slot_id")
    parser_unq = subparsers.add_parser("unquarantine", help="Unquarantine a slot")
    parser_unq.add_argument("slot_id")
    subparsers.add_parser("drain-all", help="Drain all eligible slots (state=ready)")
    subparsers.add_parser("unquarantine-all", help="Unquarantine all error slots")
    subparsers.add_parser("reconcile-now", help="Trigger immediate reconcile tick")
-    return parser.parse_args()
+    args = parser.parse_args(argv)
    if not args.command:
        parser.print_help()
        raise SystemExit(0)
    return args
 def _print_error(data: object) -> None:
@ -124,8 +248,19 @@ def _print_error(data: object) -> None:
 def main() -> None:
    """Entry point for the autoscalerctl CLI."""
    args = _parse_args()
-    if not args.command:
+
-        raise SystemExit(1)
+    if args.command in {"drain-all", "unquarantine-all"}:
        action = "drain" if args.command == "drain-all" else "unquarantine"
        try:
            summary = _bulk_slot_action(args.socket, action)
        except OSError as err:
            print(f"Error: cannot connect to daemon at {args.socket}")
            raise SystemExit(1) from err
        except RuntimeError as err:
            print(str(err))
            raise SystemExit(1) from err
        print(json.dumps(summary, indent=2))
        raise SystemExit(0 if summary["failed"] == 0 else 1)
    method = "GET"
    path = ""
@ -160,7 +295,15 @@ def main() -> None:
        _print_error(data)
        raise SystemExit(1)
-    if args.command in {"status", "drain", "unquarantine", "reconcile-now"}:
+    if args.command == "status":
        if not isinstance(data, dict):
            _print_error(data)
            raise SystemExit(1)
        if args.json:
            print(json.dumps(data, indent=2))
        else:
            _print_status_summary(data)
    elif args.command in {"drain", "unquarantine", "reconcile-now"}:
        print(json.dumps(data, indent=2))
    elif args.command == "slots":
        if isinstance(data, list):
--- a/agent/nix_builder_autoscaler/tests/test_cli.py
+++ b/agent/nix_builder_autoscaler/tests/test_cli.py
@ -0,0 +1,97 @@
 """Unit tests for autoscalerctl CLI argument and display behavior."""
 from __future__ import annotations
 import pytest
 from nix_builder_autoscaler import cli
 from nix_builder_autoscaler.cli import _parse_args, _print_status_summary
 def test_parse_args_without_command_prints_help_and_exits_zero(
    capsys: pytest.CaptureFixture[str],
 ) -> None:
    """An empty argv exits with code 0 and writes the help text to stdout."""
    with pytest.raises(SystemExit) as exit_info:
        _parse_args([])
    assert exit_info.value.code == 0

    help_text = capsys.readouterr().out
    for expected in ("Autoscaler CLI", "status"):
        assert expected in help_text
 def test_parse_args_json_status() -> None:
    """``--json status`` parses to the status command with the json flag set."""
    namespace = _parse_args(["--json", "status"])
    assert namespace.json is True
    assert namespace.command == "status"
 def test_parse_args_bulk_commands() -> None:
    """Both bulk subcommands are accepted and reported via ``command``."""
    for subcommand in ("drain-all", "unquarantine-all"):
        assert _parse_args([subcommand]).command == subcommand
 def test_print_status_summary_renders_metrics_table(capsys: pytest.CaptureFixture[str]) -> None:
    """The summary output contains the table header and dotted metric names."""
    slot_counts = {
        "total": 4,
        "ready": 1,
        "launching": 1,
        "booting": 1,
        "binding": 0,
        "terminating": 0,
        "empty": 1,
        "error": 0,
    }
    payload = {
        "slots": slot_counts,
        "reservations": {"pending": 2, "ready": 1, "failed": 0},
        "ec2": {"api_ok": True},
        "haproxy": {"socket_ok": True},
    }

    _print_status_summary(payload)

    rendered = capsys.readouterr().out
    for fragment in ("metric", "slots.total", "reservations.pending", "haproxy.socket_ok"):
        assert fragment in rendered
 def test_bulk_drain_only_targets_ready_slots(monkeypatch: pytest.MonkeyPatch) -> None:
    """Bulk drain posts only for the ready slot and counts the booting one as skipped."""
    listing = [
        {"slot_id": "slot001", "state": "ready"},
        {"slot_id": "slot002", "state": "booting"},
    ]

    def _fake_request(socket_path: str, method: str, path: str, body=None):  # noqa: ANN001
        assert socket_path == "/tmp/sock"
        request = (method, path)
        if request == ("GET", "/v1/slots"):
            return 200, listing
        if request == ("POST", "/v1/admin/drain") and body == {"slot_id": "slot001"}:
            return 200, {"state": "draining"}
        # Any other call means the action touched a slot it should not have.
        raise AssertionError(f"unexpected request: {method} {path} {body}")

    monkeypatch.setattr(cli, "_uds_request", _fake_request)

    summary = cli._bulk_slot_action("/tmp/sock", "drain")

    expected_counts = {"matched": 1, "attempted": 1, "succeeded": 1, "failed": 0, "skipped": 1}
    for counter, expected in expected_counts.items():
        assert summary[counter] == expected
 def test_bulk_unquarantine_only_targets_error_slots(monkeypatch: pytest.MonkeyPatch) -> None:
    """Bulk unquarantine posts only for the error slot and counts the ready one as skipped."""
    listing = [
        {"slot_id": "slot001", "state": "error"},
        {"slot_id": "slot002", "state": "ready"},
    ]

    def _fake_request(socket_path: str, method: str, path: str, body=None):  # noqa: ANN001
        assert socket_path == "/tmp/sock"
        request = (method, path)
        if request == ("GET", "/v1/slots"):
            return 200, listing
        if request == ("POST", "/v1/admin/unquarantine") and body == {"slot_id": "slot001"}:
            return 200, {"state": "empty"}
        # Any other call means the action touched a slot it should not have.
        raise AssertionError(f"unexpected request: {method} {path} {body}")

    monkeypatch.setattr(cli, "_uds_request", _fake_request)

    summary = cli._bulk_slot_action("/tmp/sock", "unquarantine")

    expected_counts = {"matched": 1, "attempted": 1, "succeeded": 1, "failed": 0, "skipped": 1}
    for counter, expected in expected_counts.items():
        assert summary[counter] == expected
--- a/flake.nix
+++ b/flake.nix
@ -62,7 +62,8 @@
          );
      mkBuildbotAutoscaleExtPythonPackagesExtension = pkgs: pyFinal: _pyPrev: {
        buildbot-autoscale-ext =
-          pyFinal.toPythonModule (mkBuildbotExtPythonSet pkgs pyFinal.python)."buildbot-autoscale-ext";
+          pyFinal.toPythonModule
            (mkBuildbotExtPythonSet pkgs pyFinal.python)."buildbot-autoscale-ext";
      };
    in
    {
@ -94,8 +95,7 @@
                  pyprojectOverrides
                ]
              );
-          buildbotExtPythonSet =
+          buildbotExtPythonSet = mkBuildbotExtPythonSet pkgs pkgs.python3;
            mkBuildbotExtPythonSet pkgs pkgs.python3;
          venv = agentPythonSet.mkVirtualEnv "nix-builder-autoscaler-env" agentWorkspace.deps.default;
          buildbotExtVenv = buildbotExtPythonSet.mkVirtualEnv "buildbot-autoscale-ext-env" buildbotExtWorkspace.deps.default;
        in
@ -145,8 +145,7 @@
                  pyprojectOverrides
                ]
              );
-          buildbotExtPythonSet =
+          buildbotExtPythonSet = mkBuildbotExtPythonSet pkgs pkgs.python3;
            mkBuildbotExtPythonSet pkgs pkgs.python3;
          testVenv = agentPythonSet.mkVirtualEnv "nix-builder-autoscaler-test-env" {
            nix-builder-autoscaler = [ "dev" ];
          };
--- a/nix/modules/nixos/services/nix-builder-autoscaler.nix
+++ b/nix/modules/nixos/services/nix-builder-autoscaler.nix
@ -275,7 +275,7 @@ in
        backend ${cfg.haproxy.backend}
          balance leastconn
          option tcp-check
-          tcp-check expect rstring SSH-2\\.0-OpenSSH.*
+          tcp-check expect rstring SSH-2\\.0-.*
          ${lib.concatMapStrings (
            i:
            "server ${cfg.haproxy.slotPrefix}${lib.fixedWidthNumber 3 i} 127.0.0.2:22 disabled check inter 5s fall 2 rise 2 maxconn 2\n          "
Author	SHA1	Message	Date
Abel Luck	57b4df2a17	improve autoscalerctl help and add bulk slot actions	2026-02-27 16:25:54 +01:00
Abel Luck	f0f6020d6a	accept tailscale ssh banner in haproxy health checks	2026-02-27 16:25:54 +01:00