diff --git a/agent/nix_builder_autoscaler/cli.py b/agent/nix_builder_autoscaler/cli.py index 28d9645..eb3ff70 100644 --- a/agent/nix_builder_autoscaler/cli.py +++ b/agent/nix_builder_autoscaler/cli.py @@ -94,13 +94,131 @@ def _print_reservations(data: list[dict[str, Any]]) -> None: _print_table(["reservation_id", "phase", "system", "slot", "instance_id"], rows) -def _parse_args() -> argparse.Namespace: +def _print_status_summary(data: dict[str, Any]) -> None: + slots = data.get("slots", {}) + reservations = data.get("reservations", {}) + ec2 = data.get("ec2", {}) + haproxy = data.get("haproxy", {}) + rows = [ + ["slots.total", str(slots.get("total", 0))], + ["slots.ready", str(slots.get("ready", 0))], + ["slots.launching", str(slots.get("launching", 0))], + ["slots.booting", str(slots.get("booting", 0))], + ["slots.binding", str(slots.get("binding", 0))], + ["slots.terminating", str(slots.get("terminating", 0))], + ["slots.empty", str(slots.get("empty", 0))], + ["slots.error", str(slots.get("error", 0))], + ["reservations.pending", str(reservations.get("pending", 0))], + ["reservations.ready", str(reservations.get("ready", 0))], + ["reservations.failed", str(reservations.get("failed", 0))], + ["ec2.api_ok", str(ec2.get("api_ok", False))], + ["haproxy.socket_ok", str(haproxy.get("socket_ok", False))], + ] + _print_table(["metric", "value"], rows) + + +def _bulk_slot_action(socket_path: str, action: str) -> dict[str, Any]: + if action == "drain": + eligible_states = {"ready"} + action_path = "/v1/admin/drain" + elif action == "unquarantine": + eligible_states = {"error"} + action_path = "/v1/admin/unquarantine" + else: + msg = f"unknown bulk action: {action}" + raise ValueError(msg) + + status, data = _uds_request(socket_path, "GET", "/v1/slots") + if status < 200 or status >= 300 or not isinstance(data, list): + msg = "failed to list slots for bulk action" + raise RuntimeError(msg) + + results: list[dict[str, Any]] = [] + summary: dict[str, Any] = { + "action": action, + "matched": 0, + "attempted": 0, + "succeeded": 0, + "failed": 0, + "skipped": 0, + "results": results, + } + + for slot in data: + slot_id = str(slot.get("slot_id", "")) + state = str(slot.get("state", "")) + if not slot_id: + continue + + if state not in eligible_states: + summary["skipped"] += 1 + results.append( + { + "slot_id": slot_id, + "state": state, + "result": "skipped", + "reason": "ineligible_state", + } + ) + continue + + summary["matched"] += 1 + summary["attempted"] += 1 + try: + action_status, action_data = _uds_request( + socket_path, + "POST", + action_path, + body={"slot_id": slot_id}, + ) + except OSError as err: + summary["failed"] += 1 + results.append( + { + "slot_id": slot_id, + "state": state, + "result": "failed", + "error": str(err), + } + ) + continue + + if 200 <= action_status < 300: + summary["succeeded"] += 1 + results.append( + { + "slot_id": slot_id, + "state": state, + "result": "ok", + } + ) + else: + summary["failed"] += 1 + results.append( + { + "slot_id": slot_id, + "state": state, + "result": "failed", + "status": action_status, + "response": action_data, + } + ) + + return summary + + +def _parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(prog="autoscalerctl", description="Autoscaler CLI") parser.add_argument( "--socket", default="/run/nix-builder-autoscaler/daemon.sock", help="Daemon Unix socket path", ) + parser.add_argument( + "--json", + action="store_true", + help="Output JSON for status command.", + ) subparsers = parser.add_subparsers(dest="command") subparsers.add_parser("status", help="Show state summary") subparsers.add_parser("slots", help="List slots") @@ -110,8 +228,14 @@ def _parse_args() -> argparse.Namespace: parser_drain.add_argument("slot_id") parser_unq = subparsers.add_parser("unquarantine", help="Unquarantine a slot") parser_unq.add_argument("slot_id") + subparsers.add_parser("drain-all", help="Drain all eligible slots (state=ready)") + subparsers.add_parser("unquarantine-all", help="Unquarantine all error slots") subparsers.add_parser("reconcile-now", help="Trigger immediate reconcile tick") - return parser.parse_args() + args = parser.parse_args(argv) + if not args.command: + parser.print_help() + raise SystemExit(0) + return args def _print_error(data: object) -> None: @@ -124,8 +248,19 @@ def _print_error(data: object) -> None: def main() -> None: """Entry point for the autoscalerctl CLI.""" args = _parse_args() - if not args.command: - raise SystemExit(1) + + if args.command in {"drain-all", "unquarantine-all"}: + action = "drain" if args.command == "drain-all" else "unquarantine" + try: + summary = _bulk_slot_action(args.socket, action) + except OSError as err: + print(f"Error: cannot connect to daemon at {args.socket}") + raise SystemExit(1) from err + except RuntimeError as err: + print(str(err)) + raise SystemExit(1) from err + print(json.dumps(summary, indent=2)) + raise SystemExit(0 if summary["failed"] == 0 else 1) method = "GET" path = "" @@ -160,7 +295,15 @@ def main() -> None: _print_error(data) raise SystemExit(1) - if args.command in {"status", "drain", "unquarantine", "reconcile-now"}: + if args.command == "status": + if not isinstance(data, dict): + _print_error(data) + raise SystemExit(1) + if args.json: + print(json.dumps(data, indent=2)) + else: + _print_status_summary(data) + elif args.command in {"drain", "unquarantine", "reconcile-now"}: print(json.dumps(data, indent=2)) elif args.command == "slots": if isinstance(data, list): diff --git a/agent/nix_builder_autoscaler/tests/test_cli.py b/agent/nix_builder_autoscaler/tests/test_cli.py new file mode 100644 index 0000000..2440b7d --- /dev/null +++ b/agent/nix_builder_autoscaler/tests/test_cli.py @@ -0,0 +1,97 @@ +"""Unit tests for autoscalerctl CLI argument and display behavior.""" + +from __future__ import annotations + +import pytest + +from nix_builder_autoscaler import cli +from nix_builder_autoscaler.cli import _parse_args, _print_status_summary + + +def test_parse_args_without_command_prints_help_and_exits_zero( + capsys: pytest.CaptureFixture[str], +) -> None: + with pytest.raises(SystemExit) as exc: + _parse_args([]) + assert exc.value.code == 0 + captured = capsys.readouterr() + assert "Autoscaler CLI" in captured.out + assert "status" in captured.out + + +def test_parse_args_json_status() -> None: + args = _parse_args(["--json", "status"]) + assert args.command == "status" + assert args.json is True + + +def test_parse_args_bulk_commands() -> None: + assert _parse_args(["drain-all"]).command == "drain-all" + assert _parse_args(["unquarantine-all"]).command == "unquarantine-all" + + +def test_print_status_summary_renders_metrics_table(capsys: pytest.CaptureFixture[str]) -> None: + _print_status_summary( + { + "slots": { + "total": 4, + "ready": 1, + "launching": 1, + "booting": 1, + "binding": 0, + "terminating": 0, + "empty": 1, + "error": 0, + }, + "reservations": {"pending": 2, "ready": 1, "failed": 0}, + "ec2": {"api_ok": True}, + "haproxy": {"socket_ok": True}, + } + ) + out = capsys.readouterr().out + assert "metric" in out + assert "slots.total" in out + assert "reservations.pending" in out + assert "haproxy.socket_ok" in out + + +def test_bulk_drain_only_targets_ready_slots(monkeypatch: pytest.MonkeyPatch) -> None: + def _fake_request(socket_path: str, method: str, path: str, body=None): # noqa: ANN001 + assert socket_path == "/tmp/sock" + if method == "GET" and path == "/v1/slots": + return 200, [ + {"slot_id": "slot001", "state": "ready"}, + {"slot_id": "slot002", "state": "booting"}, + ] + if method == "POST" and path == "/v1/admin/drain" and body == {"slot_id": "slot001"}: + return 200, {"state": "draining"} + raise AssertionError(f"unexpected request: {method} {path} {body}") + + monkeypatch.setattr(cli, "_uds_request", _fake_request) + summary = cli._bulk_slot_action("/tmp/sock", "drain") + assert summary["matched"] == 1 + assert summary["attempted"] == 1 + assert summary["succeeded"] == 1 + assert summary["failed"] == 0 + assert summary["skipped"] == 1 + + +def test_bulk_unquarantine_only_targets_error_slots(monkeypatch: pytest.MonkeyPatch) -> None: + def _fake_request(socket_path: str, method: str, path: str, body=None): # noqa: ANN001 + assert socket_path == "/tmp/sock" + if method == "GET" and path == "/v1/slots": + return 200, [ + {"slot_id": "slot001", "state": "error"}, + {"slot_id": "slot002", "state": "ready"}, + ] + if method == "POST" and path == "/v1/admin/unquarantine" and body == {"slot_id": "slot001"}: + return 200, {"state": "empty"} + raise AssertionError(f"unexpected request: {method} {path} {body}") + + monkeypatch.setattr(cli, "_uds_request", _fake_request) + summary = cli._bulk_slot_action("/tmp/sock", "unquarantine") + assert summary["matched"] == 1 + assert summary["attempted"] == 1 + assert summary["succeeded"] == 1 + assert summary["failed"] == 0 + assert summary["skipped"] == 1 diff --git a/flake.nix b/flake.nix index a5f8570..3293071 100644 --- a/flake.nix +++ b/flake.nix @@ -62,7 +62,8 @@ ); mkBuildbotAutoscaleExtPythonPackagesExtension = pkgs: pyFinal: _pyPrev: { buildbot-autoscale-ext = - pyFinal.toPythonModule (mkBuildbotExtPythonSet pkgs pyFinal.python)."buildbot-autoscale-ext"; + pyFinal.toPythonModule + (mkBuildbotExtPythonSet pkgs pyFinal.python)."buildbot-autoscale-ext"; }; in { @@ -94,8 +95,7 @@ pyprojectOverrides ] ); - buildbotExtPythonSet = - mkBuildbotExtPythonSet pkgs pkgs.python3; + buildbotExtPythonSet = mkBuildbotExtPythonSet pkgs pkgs.python3; venv = agentPythonSet.mkVirtualEnv "nix-builder-autoscaler-env" agentWorkspace.deps.default; buildbotExtVenv = buildbotExtPythonSet.mkVirtualEnv "buildbot-autoscale-ext-env" buildbotExtWorkspace.deps.default; in @@ -145,8 +145,7 @@ pyprojectOverrides ] ); - buildbotExtPythonSet = - mkBuildbotExtPythonSet pkgs pkgs.python3; + buildbotExtPythonSet = mkBuildbotExtPythonSet pkgs pkgs.python3; testVenv = agentPythonSet.mkVirtualEnv "nix-builder-autoscaler-test-env" { nix-builder-autoscaler = [ "dev" ]; }; diff --git a/nix/modules/nixos/services/nix-builder-autoscaler.nix b/nix/modules/nixos/services/nix-builder-autoscaler.nix index 803b7a3..7697ac5 100644 --- a/nix/modules/nixos/services/nix-builder-autoscaler.nix +++ b/nix/modules/nixos/services/nix-builder-autoscaler.nix @@ -275,7 +275,7 @@ in backend ${cfg.haproxy.backend} balance leastconn option tcp-check - tcp-check expect rstring SSH-2\\.0-OpenSSH.* + tcp-check expect rstring SSH-2\\.0-.* ${lib.concatMapStrings ( i: "server ${cfg.haproxy.slotPrefix}${lib.fixedWidthNumber 3 i} 127.0.0.2:22 disabled check inter 5s fall 2 rise 2 maxconn 2\n "