add runtime adapters, scheduler, reconciler, and their unit tests

This commit is contained in:
Abel Luck 2026-02-27 12:34:32 +01:00
parent d1976a5fd8
commit b63d69c81d
10 changed files with 1471 additions and 28 deletions

View file

@ -1 +1,148 @@
"""HAProxy provider unit tests — Plan 02."""
"""Unit tests for the HAProxy provider, mocking at socket level."""
from unittest.mock import MagicMock, patch
import pytest
from nix_builder_autoscaler.providers.haproxy import HAProxyError, HAProxyRuntime
# Canned HAProxy `show stat` CSV output used as the fake socket reply.
# Only the leading columns are kept, up to and including `status`
# (zero-based column 17); the full output has many more columns.
# Rows: the BACKEND aggregate plus three server rows (UP / DOWN / MAINT).
SHOW_STAT_CSV = "\n".join(
    [
        "# pxname,svname,qcur,qmax,scur,smax,slim,stot,"
        "bin,bout,dreq,dresp,ereq,econ,eresp,wretr,wredis,status",
        "all,BACKEND,0,0,2,5,200,100,5000,6000,0,0,,0,0,0,0,UP",
        "all,slot001,0,0,1,3,50,50,2500,3000,0,0,,0,0,0,0,UP",
        "all,slot002,0,0,1,2,50,50,2500,3000,0,0,,0,0,0,0,DOWN",
        "all,slot003,0,0,0,0,50,0,0,0,0,0,,0,0,0,0,MAINT",
        "",  # trailing empty item gives the final newline
    ]
)
class TestSetSlotAddr:
    """Checks the command written to the socket by ``set_slot_addr``."""

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_sends_correct_command(self, mock_socket_cls):
        """The runtime connects to the socket path and sends one ``set server`` line."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        # recv yields a bare newline first, then b"" on the following read.
        fake_conn.recv.side_effect = [b"\n", b""]

        runtime = HAProxyRuntime("/tmp/test.sock", "all", "slot")
        runtime.set_slot_addr("slot001", "100.64.0.1", 22)

        fake_conn.connect.assert_called_once_with("/tmp/test.sock")
        expected_command = b"set server all/slot001 addr 100.64.0.1 port 22\n"
        fake_conn.sendall.assert_called_once_with(expected_command)
class TestEnableSlot:
    """Checks the command written to the socket by ``enable_slot``."""

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_sends_correct_command(self, mock_socket_cls):
        """Enabling ``slot001`` sends exactly one ``enable server`` line."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        # recv yields a bare newline first, then b"" on the following read.
        fake_conn.recv.side_effect = [b"\n", b""]

        runtime = HAProxyRuntime("/tmp/test.sock", "all", "slot")
        runtime.enable_slot("slot001")

        expected_command = b"enable server all/slot001\n"
        fake_conn.sendall.assert_called_once_with(expected_command)
class TestDisableSlot:
    """Checks the command written to the socket by ``disable_slot``."""

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_sends_correct_command(self, mock_socket_cls):
        """Disabling ``slot001`` sends exactly one ``disable server`` line."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        # recv yields a bare newline first, then b"" on the following read.
        fake_conn.recv.side_effect = [b"\n", b""]

        runtime = HAProxyRuntime("/tmp/test.sock", "all", "slot")
        runtime.disable_slot("slot001")

        expected_command = b"disable server all/slot001\n"
        fake_conn.sendall.assert_called_once_with(expected_command)
class TestReadSlotHealth:
    """Checks that ``read_slot_health`` parses the canned ``show stat`` CSV."""

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_parses_csv_correctly(self, mock_socket_cls):
        """Three slot rows are returned with their status, scur and qcur values."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        fake_conn.recv.side_effect = [SHOW_STAT_CSV.encode(), b""]

        runtime = HAProxyRuntime("/tmp/test.sock", "all", "slot")
        health = runtime.read_slot_health()

        # Only three entries: the BACKEND row should be excluded
        # (svname "BACKEND" doesn't start with "slot").
        assert len(health) == 3

        # slot name -> (status, scur, qcur) as written in SHOW_STAT_CSV.
        expected_rows = {
            "slot001": ("UP", 1, 0),
            "slot002": ("DOWN", 1, 0),
            "slot003": ("MAINT", 0, 0),
        }
        for slot_name, (status, scur, qcur) in expected_rows.items():
            entry = health[slot_name]
            assert entry.status == status
            assert entry.scur == scur
            assert entry.qcur == qcur
class TestSlotIsUp:
    """Checks ``slot_is_up`` against the canned ``show stat`` CSV."""

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_returns_true_for_up_slot(self, mock_socket_cls):
        """``slot001`` has status UP in the fixture, so the result is ``True``."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        fake_conn.recv.side_effect = [SHOW_STAT_CSV.encode(), b""]

        runtime = HAProxyRuntime("/tmp/test.sock", "all", "slot")
        result = runtime.slot_is_up("slot001")

        assert result is True

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_returns_false_for_down_slot(self, mock_socket_cls):
        """``slot002`` has status DOWN in the fixture, so the result is ``False``."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        fake_conn.recv.side_effect = [SHOW_STAT_CSV.encode(), b""]

        runtime = HAProxyRuntime("/tmp/test.sock", "all", "slot")
        result = runtime.slot_is_up("slot002")

        assert result is False
class TestErrorHandling:
    """Checks that failures surface as ``HAProxyError`` with a matching message."""

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_unrecognized_slot_raises_haproxy_error(self, mock_socket_cls):
        """A ``No such server.`` reply from the socket is raised as ``HAProxyError``."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        fake_conn.recv.side_effect = [b"No such server.\n", b""]
        runtime = HAProxyRuntime("/tmp/test.sock", "all", "slot")

        with pytest.raises(HAProxyError, match="No such server"):
            runtime.set_slot_addr("slot999", "100.64.0.1")

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_socket_not_found_raises_haproxy_error(self, mock_socket_cls):
        """``FileNotFoundError`` on connect is raised as ``HAProxyError``."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        fake_conn.connect.side_effect = FileNotFoundError("No such file")
        runtime = HAProxyRuntime("/tmp/nonexistent.sock", "all", "slot")

        with pytest.raises(HAProxyError, match="socket not found"):
            runtime.set_slot_addr("slot001", "100.64.0.1")

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_connection_refused_raises_haproxy_error(self, mock_socket_cls):
        """``ConnectionRefusedError`` on connect is raised as ``HAProxyError``."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        fake_conn.connect.side_effect = ConnectionRefusedError("Connection refused")
        runtime = HAProxyRuntime("/tmp/test.sock", "all", "slot")

        with pytest.raises(HAProxyError, match="Connection refused"):
            runtime.enable_slot("slot001")

    @patch("nix_builder_autoscaler.providers.haproxy.socket.socket")
    def test_slot_session_count_missing_slot_raises(self, mock_socket_cls):
        """Asking for the session count of a slot absent from the CSV raises."""
        fake_conn = mock_socket_cls.return_value = MagicMock()
        fake_conn.recv.side_effect = [SHOW_STAT_CSV.encode(), b""]
        runtime = HAProxyRuntime("/tmp/test.sock", "all", "slot")

        with pytest.raises(HAProxyError, match="Slot not found"):
            runtime.slot_session_count("slot_nonexistent")

View file

@ -1 +1,286 @@
"""EC2 runtime unit tests — Plan 02."""
"""Unit tests for the EC2 runtime adapter using botocore Stubber."""
from datetime import UTC, datetime
from unittest.mock import patch
import boto3
import pytest
from botocore.stub import Stubber
from nix_builder_autoscaler.config import AwsConfig
from nix_builder_autoscaler.runtime.base import RuntimeError as RuntimeAdapterError
from nix_builder_autoscaler.runtime.ec2 import EC2Runtime
def _make_config():
    """Return the ``AwsConfig`` shared by the EC2 runtime tests.

    It names two subnets, one security group, a launch template and an
    instance profile, all in ``us-east-1``.
    """
    settings = {
        "region": "us-east-1",
        "launch_template_id": "lt-abc123",
        "subnet_ids": ["subnet-aaa", "subnet-bbb"],
        "security_group_ids": ["sg-111"],
        "instance_profile_arn": "arn:aws:iam::123456789012:instance-profile/nix-builder",
    }
    return AwsConfig(**settings)
def _make_runtime(stubber, ec2_client, **kwargs):
config = kwargs.pop("config", _make_config())
environment = kwargs.pop("environment", "dev")
stubber.activate()
return EC2Runtime(config, environment=environment, _client=ec2_client)
class TestLaunchSpot:
    """Tests for ``EC2Runtime.launch_spot`` against a stubbed EC2 client."""

    def test_correct_params_and_returns_instance_id(self):
        """``launch_spot`` sends the expected RunInstances request and returns the id."""
        config = _make_config()
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)

        # The stubber compares this against the request, so every key and
        # value here is part of the assertion.
        expected_params = {
            "MinCount": 1,
            "MaxCount": 1,
            "LaunchTemplate": {
                "LaunchTemplateId": "lt-abc123",
                "Version": "$Latest",
            },
            "InstanceMarketOptions": {
                "MarketType": "spot",
                "SpotOptions": {
                    "SpotInstanceType": "one-time",
                    "InstanceInterruptionBehavior": "terminate",
                },
            },
            "SubnetId": "subnet-aaa",
            "UserData": "#!/bin/bash\necho hello",
            "TagSpecifications": [
                {
                    "ResourceType": "instance",
                    "Tags": [
                        {"Key": "Name", "Value": "nix-builder-slot001"},
                        {"Key": "AutoscalerSlot", "Value": "slot001"},
                        {"Key": "ManagedBy", "Value": "nix-builder-autoscaler"},
                        {"Key": "Service", "Value": "nix-builder"},
                        {"Key": "Environment", "Value": "dev"},
                    ],
                }
            ],
        }
        response = {
            "Instances": [{"InstanceId": "i-12345678"}],
            "OwnerId": "123456789012",
        }
        stubber.add_response("run_instances", response, expected_params)

        runtime = _make_runtime(stubber, ec2_client, config=config)
        iid = runtime.launch_spot("slot001", "#!/bin/bash\necho hello")

        assert iid == "i-12345678"
        stubber.assert_no_pending_responses()

    def test_round_robin_subnets(self):
        """Two consecutive launches use ``subnet-aaa`` and then ``subnet-bbb``."""
        config = _make_config()
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)

        # One canned success per launch.
        for _ in range(2):
            stubber.add_response(
                "run_instances",
                {"Instances": [{"InstanceId": "i-abc"}], "OwnerId": "123"},
            )

        # Spy on run_instances so the subnet of each request can be checked
        # without pinning every other request parameter. The real method
        # (and therefore the stubber) still handles the call.
        with patch.object(
            ec2_client, "run_instances", wraps=ec2_client.run_instances
        ) as run_instances_spy:
            runtime = _make_runtime(stubber, ec2_client, config=config)
            runtime.launch_spot("slot001", "")
            runtime.launch_spot("slot002", "")

        stubber.assert_no_pending_responses()
        used_subnets = [
            call.kwargs["SubnetId"] for call in run_instances_spy.call_args_list
        ]
        assert used_subnets == ["subnet-aaa", "subnet-bbb"]
class TestDescribeInstance:
    """Tests for ``EC2Runtime.describe_instance`` against a stubbed EC2 client."""

    def test_normalizes_response(self):
        """A running instance maps to its state, no Tailscale IP, and an ISO launch time."""
        client = boto3.client("ec2", region_name="us-east-1")
        stub = Stubber(client)
        launched_at = datetime(2026, 1, 15, 12, 30, 0, tzinfo=UTC)

        instance = {
            "InstanceId": "i-running1",
            "State": {"Code": 16, "Name": "running"},
            "LaunchTime": launched_at,
            "Tags": [
                {"Key": "AutoscalerSlot", "Value": "slot001"},
            ],
        }
        stub.add_response(
            "describe_instances",
            {"Reservations": [{"Instances": [instance]}]},
            {"InstanceIds": ["i-running1"]},
        )

        described = _make_runtime(stub, client).describe_instance("i-running1")

        assert described["state"] == "running"
        assert described["tailscale_ip"] is None
        assert described["launch_time"] == launched_at.isoformat()

    def test_missing_instance_returns_terminated(self):
        """An empty reservation list is reported as a terminated instance."""
        client = boto3.client("ec2", region_name="us-east-1")
        stub = Stubber(client)
        stub.add_response(
            "describe_instances",
            {"Reservations": []},
            {"InstanceIds": ["i-gone"]},
        )

        described = _make_runtime(stub, client).describe_instance("i-gone")

        assert described["state"] == "terminated"
        assert described["tailscale_ip"] is None
        assert described["launch_time"] is None
class TestListManagedInstances:
    """Tests for ``EC2Runtime.list_managed_instances`` against a stubbed EC2 client."""

    def test_filters_by_tag(self):
        """The request filters on the ManagedBy tag and state; results are flattened."""
        client = boto3.client("ec2", region_name="us-east-1")
        stub = Stubber(client)

        # Filters the stubber requires the DescribeInstances request to carry.
        tag_filter = {"Name": "tag:ManagedBy", "Values": ["nix-builder-autoscaler"]}
        state_filter = {
            "Name": "instance-state-name",
            "Values": ["pending", "running", "shutting-down", "stopping"],
        }

        running_instance = {
            "InstanceId": "i-aaa",
            "State": {"Code": 16, "Name": "running"},
            "Tags": [
                {"Key": "AutoscalerSlot", "Value": "slot001"},
                {"Key": "ManagedBy", "Value": "nix-builder-autoscaler"},
            ],
        }
        pending_instance = {
            "InstanceId": "i-bbb",
            "State": {"Code": 0, "Name": "pending"},
            "Tags": [
                {"Key": "AutoscalerSlot", "Value": "slot002"},
                {"Key": "ManagedBy", "Value": "nix-builder-autoscaler"},
            ],
        }
        stub.add_response(
            "describe_instances",
            {"Reservations": [{"Instances": [running_instance, pending_instance]}]},
            {"Filters": [tag_filter, state_filter]},
        )

        managed = _make_runtime(stub, client).list_managed_instances()

        assert len(managed) == 2
        first, second = managed[0], managed[1]
        assert first["instance_id"] == "i-aaa"
        assert first["state"] == "running"
        assert first["slot_id"] == "slot001"
        assert second["instance_id"] == "i-bbb"
        assert second["state"] == "pending"
        assert second["slot_id"] == "slot002"
class TestTerminateInstance:
    """Tests for ``EC2Runtime.terminate_instance`` against a stubbed EC2 client."""

    def test_calls_terminate_api(self):
        """Terminating ``i-kill`` issues one TerminateInstances call for that id."""
        client = boto3.client("ec2", region_name="us-east-1")
        stub = Stubber(client)

        state_change = {
            "InstanceId": "i-kill",
            "CurrentState": {"Code": 32, "Name": "shutting-down"},
            "PreviousState": {"Code": 16, "Name": "running"},
        }
        stub.add_response(
            "terminate_instances",
            {"TerminatingInstances": [state_change]},
            {"InstanceIds": ["i-kill"]},
        )

        # The call itself is the check: it should not raise.
        _make_runtime(stub, client).terminate_instance("i-kill")

        stub.assert_no_pending_responses()
class TestErrorClassification:
    """Tests how ``launch_spot`` classifies and retries EC2 client errors."""

    def test_insufficient_capacity_classified(self):
        """``InsufficientInstanceCapacity`` surfaces with category ``capacity_unavailable``."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        stubber.add_client_error(
            "run_instances",
            service_error_code="InsufficientInstanceCapacity",
            service_message="Insufficient capacity",
        )
        runtime = _make_runtime(stubber, ec2_client)

        with pytest.raises(RuntimeAdapterError) as exc_info:
            runtime.launch_spot("slot001", "#!/bin/bash")

        # Must sit outside the ``with`` block, otherwise it would never run.
        assert exc_info.value.category == "capacity_unavailable"

    @patch("nix_builder_autoscaler.runtime.ec2.time.sleep")
    def test_request_limit_exceeded_retried(self, mock_sleep):
        """One ``RequestLimitExceeded`` followed by a success is retried transparently."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        # First call: throttled
        stubber.add_client_error(
            "run_instances",
            service_error_code="RequestLimitExceeded",
            service_message="Rate exceeded",
        )
        # Second call: success
        stubber.add_response(
            "run_instances",
            {"Instances": [{"InstanceId": "i-retry123"}], "OwnerId": "123"},
        )
        runtime = _make_runtime(stubber, ec2_client)

        iid = runtime.launch_spot("slot001", "#!/bin/bash")

        assert iid == "i-retry123"
        assert mock_sleep.called
        stubber.assert_no_pending_responses()

    @patch("nix_builder_autoscaler.runtime.ec2.time.sleep")
    def test_request_limit_exceeded_exhausted(self, mock_sleep):
        """After max retries, RequestLimitExceeded raises with 'throttled' category."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        # 4 errors (max_retries=3: attempt 0,1,2,3 all fail)
        for _ in range(4):
            stubber.add_client_error(
                "run_instances",
                service_error_code="RequestLimitExceeded",
                service_message="Rate exceeded",
            )
        runtime = _make_runtime(stubber, ec2_client)

        with pytest.raises(RuntimeAdapterError) as exc_info:
            runtime.launch_spot("slot001", "#!/bin/bash")

        assert exc_info.value.category == "throttled"
        # All four queued errors must have been consumed; without this check
        # a runtime that gave up after fewer attempts would still pass.
        stubber.assert_no_pending_responses()

View file

@ -1 +1,194 @@
"""Scheduler unit tests — Plan 03."""
from nix_builder_autoscaler.config import AppConfig, AwsConfig, CapacityConfig
from nix_builder_autoscaler.metrics import MetricsRegistry
from nix_builder_autoscaler.models import ReservationPhase, SlotState
from nix_builder_autoscaler.providers.clock import FakeClock
from nix_builder_autoscaler.runtime.fake import FakeRuntime
from nix_builder_autoscaler.scheduler import scheduling_tick
from nix_builder_autoscaler.state_db import StateDB
def _make_env(
    slot_count=3,
    max_slots=3,
    max_leases=1,
    idle_scale_down_seconds=900,
    target_warm=0,
    min_slots=0,
):
    """Build the collaborators passed to ``scheduling_tick`` in these tests.

    Creates an in-memory ``StateDB`` driven by a ``FakeClock`` and seeded
    with ``slot_count`` slots, a ``FakeRuntime``, an ``AppConfig`` whose
    capacity section comes from the arguments (reservation TTL fixed at
    1200), and a fresh ``MetricsRegistry``.

    Returns:
        The tuple ``(db, runtime, config, clock, metrics)``.
    """
    clock = FakeClock()

    db = StateDB(":memory:", clock=clock)
    db.init_schema()
    db.init_slots("slot", slot_count, "x86_64-linux", "all")

    runtime = FakeRuntime(launch_latency_ticks=2, ip_delay_ticks=1)

    capacity = CapacityConfig(
        max_slots=max_slots,
        max_leases_per_slot=max_leases,
        idle_scale_down_seconds=idle_scale_down_seconds,
        target_warm_slots=target_warm,
        min_slots=min_slots,
        reservation_ttl_seconds=1200,
    )
    config = AppConfig(capacity=capacity, aws=AwsConfig(region="us-east-1"))

    return db, runtime, config, clock, MetricsRegistry()
def _make_slot_ready(db, slot_id, instance_id="i-test1", ip="100.64.0.1"):
    """Walk ``slot_id`` through LAUNCHING, BOOTING, BINDING and finally READY.

    The instance id is recorded on the LAUNCHING step and the IP on the
    BINDING step; the other two steps carry no extra fields.
    """
    transitions = (
        (SlotState.LAUNCHING, {"instance_id": instance_id}),
        (SlotState.BOOTING, {}),
        (SlotState.BINDING, {"instance_ip": ip}),
        (SlotState.READY, {}),
    )
    for next_state, extra_fields in transitions:
        db.update_slot_state(slot_id, next_state, **extra_fields)
# --- Test cases ---
def test_pending_reservation_assigned_to_ready_slot():
    """One tick binds a pending reservation to the ready slot and takes a lease."""
    db, runtime, config, clock, metrics = _make_env()
    _make_slot_ready(db, "slot001")
    reservation = db.create_reservation("x86_64-linux", "test", None, 1200)

    scheduling_tick(db, runtime, config, clock, metrics)

    after = db.get_reservation(reservation["reservation_id"])
    assert after["phase"] == ReservationPhase.READY.value
    assert after["slot_id"] == "slot001"
    # "i-test1" is the default instance id used by _make_slot_ready.
    assert after["instance_id"] == "i-test1"

    assert db.get_slot("slot001")["lease_count"] == 1
def test_two_pending_one_slot_only_one_assigned_per_tick():
    """With one single-lease slot, one reservation becomes ready and one stays pending."""
    db, runtime, config, clock, metrics = _make_env(max_leases=1)
    _make_slot_ready(db, "slot001")
    first = db.create_reservation("x86_64-linux", "test1", None, 1200)
    second = db.create_reservation("x86_64-linux", "test2", None, 1200)

    scheduling_tick(db, runtime, config, clock, metrics)

    phases = [
        db.get_reservation(created["reservation_id"])["phase"]
        for created in (first, second)
    ]
    assert phases.count(ReservationPhase.READY.value) == 1
    assert phases.count(ReservationPhase.PENDING.value) == 1

    assert db.get_slot("slot001")["lease_count"] == 1
def test_reservation_expires_when_ttl_passes():
    """A reservation with a 60 second TTL is EXPIRED after the clock moves 61 seconds."""
    db, runtime, config, clock, metrics = _make_env()
    ttl_seconds = 60
    config.capacity.reservation_ttl_seconds = ttl_seconds
    db.create_reservation("x86_64-linux", "test", None, ttl_seconds)

    clock.advance(61)
    scheduling_tick(db, runtime, config, clock, metrics)

    expired = db.list_reservations(ReservationPhase.EXPIRED)
    assert len(expired) == 1
def test_scale_down_starts_when_idle_exceeds_threshold():
    """A ready slot with no leases is DRAINING once 901s pass a 900s idle limit."""
    db, runtime, config, clock, metrics = _make_env(idle_scale_down_seconds=900)
    _make_slot_ready(db, "slot001")

    clock.advance(901)
    scheduling_tick(db, runtime, config, clock, metrics)

    assert db.get_slot("slot001")["state"] == SlotState.DRAINING.value
def test_slot_does_not_drain_while_lease_count_positive():
    """A slot holding an assigned reservation stays READY past the idle limit."""
    db, runtime, config, clock, metrics = _make_env(idle_scale_down_seconds=900)
    _make_slot_ready(db, "slot001")
    reservation = db.create_reservation("x86_64-linux", "test", None, 1200)

    # First tick: the reservation must actually land on the slot.
    scheduling_tick(db, runtime, config, clock, metrics)
    assigned = db.get_reservation(reservation["reservation_id"])
    assert assigned["phase"] == ReservationPhase.READY.value

    # Second tick, after the idle limit has elapsed.
    clock.advance(901)
    scheduling_tick(db, runtime, config, clock, metrics)

    assert db.get_slot("slot001")["state"] == SlotState.READY.value
def test_interruption_pending_slot_moves_to_draining():
    """A ready slot flagged ``interruption_pending`` drains and has the flag cleared."""
    db, runtime, config, clock, metrics = _make_env()
    _make_slot_ready(db, "slot001")
    db.update_slot_fields("slot001", interruption_pending=1)

    scheduling_tick(db, runtime, config, clock, metrics)

    drained = db.get_slot("slot001")
    assert drained["state"] == SlotState.DRAINING.value
    assert drained["interruption_pending"] == 0
def test_launch_triggered_for_unmet_demand():
    """A pending reservation with no ready slot causes exactly one launch."""
    db, runtime, config, clock, metrics = _make_env()
    db.create_reservation("x86_64-linux", "test", None, 1200)

    scheduling_tick(db, runtime, config, clock, metrics)

    launching_slots = db.list_slots(SlotState.LAUNCHING)
    assert len(launching_slots) == 1
    assert launching_slots[0]["instance_id"] is not None

    # The fake runtime should report one instance for that launch.
    assert len(runtime.list_managed_instances()) == 1
def test_launch_respects_max_slots():
    """No slot is launched for leftover demand when ``max_slots`` is already reached."""
    db, runtime, config, clock, metrics = _make_env(max_slots=1)
    _make_slot_ready(db, "slot001")
    # Two reservations against a single slot: slot001 will be at capacity
    # (lease_count 1) once the first one is assigned.
    for label in ("test1", "test2"):
        db.create_reservation("x86_64-linux", label, None, 1200)

    scheduling_tick(db, runtime, config, clock, metrics)

    # One reservation assigned, one still pending, but no new launch
    # because active_slots (1) == max_slots (1).
    assert len(db.list_slots(SlotState.LAUNCHING)) == 0
def test_min_slots_maintained():
    """With ``min_slots=1`` and no reservations, one tick puts one slot in LAUNCHING."""
    db, runtime, config, clock, metrics = _make_env(min_slots=1)

    # No reservations are created and no slot has been made ready.
    scheduling_tick(db, runtime, config, clock, metrics)

    assert len(db.list_slots(SlotState.LAUNCHING)) == 1
def test_scale_down_respects_min_slots():
    """With ``min_slots=1``, the only ready slot stays READY past the idle limit."""
    db, runtime, config, clock, metrics = _make_env(
        min_slots=1, idle_scale_down_seconds=900
    )
    _make_slot_ready(db, "slot001")

    clock.advance(901)
    scheduling_tick(db, runtime, config, clock, metrics)

    assert db.get_slot("slot001")["state"] == SlotState.READY.value