nix-builder-autoscaler/agent/nix_builder_autoscaler/tests/test_runtime_ec2.py
Abel Luck 02b1a063ab
Some checks failed
buildbot/nix-eval Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.package-nix-builder-autoscaler Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.package-default Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.app-autoscalerctl Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.app-default Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.app-nix-builder-autoscaler Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.nix-builder-autoscaler-pyright Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.nix-builder-autoscaler-integration-tests Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.nix-builder-autoscaler-ruff Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.nix-builder-autoscaler-unit-tests Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.package-buildbot-autoscale-ext Build done.
buildbot/nix-build Build done.
support dual launch templates: spot for normal builds, on-demand for nested virtualization
AWS does not allow cpu_options.nested_virtualization with spot instances. Add a second
launch template (on-demand, cpu_options enabled) alongside the existing spot template.
The autoscaler selects the template per-system based on nested_virtualization config.

- RuntimeAdapter.launch_spot -> launch_instance(nested_virtualization=False)
- EC2Runtime: selects spot or on-demand LT; raises misconfiguration error if
  on_demand_launch_template_id is empty when nested_virtualization=True
- AwsConfig: add on_demand_launch_template_id field
- SystemConfig: add nested_virtualization field
- Scheduler: looks up system config to pass nested_virtualization flag
- NixOS module: new aws.onDemandLaunchTemplateIdFile + capacity.nestedVirtualization
  options; assertion prevents enabling nestedVirtualization without the LT ID file
2026-02-28 10:33:26 +01:00

464 lines
16 KiB
Python

"""Unit tests for the EC2 runtime adapter using botocore Stubber."""
from datetime import UTC, datetime
from unittest.mock import patch
import boto3
import pytest
from botocore.stub import Stubber
from nix_builder_autoscaler.config import AwsConfig
from nix_builder_autoscaler.runtime.base import RuntimeError as RuntimeAdapterError
from nix_builder_autoscaler.runtime.ec2 import EC2Runtime
def _make_config():
    """Build the AwsConfig fixture used by the tests in this module."""
    settings = {
        "region": "us-east-1",
        "launch_template_id": "lt-abc123",
        "subnet_ids": ["subnet-aaa", "subnet-bbb"],
        "security_group_ids": ["sg-111"],
        "instance_profile_arn": "arn:aws:iam::123456789012:instance-profile/nix-builder",
    }
    return AwsConfig(**settings)
def _make_runtime(stubber, ec2_client, **kwargs):
    """Activate ``stubber`` and return an EC2Runtime bound to ``ec2_client``.

    Optional keyword arguments:
        config: configuration passed to EC2Runtime; defaults to ``_make_config()``.
        environment: environment name passed to EC2Runtime; defaults to ``"dev"``.
    """
    # Caller-supplied values override the defaults; other keys are ignored.
    options = {"config": _make_config(), "environment": "dev", **kwargs}
    stubber.activate()
    return EC2Runtime(
        options["config"],
        environment=options["environment"],
        _client=ec2_client,
    )
class TestLaunchSpot:
    """Tests for EC2Runtime.launch_instance using the default (spot) request."""

    def test_correct_params_and_returns_instance_id(self):
        """launch_instance sends the expected RunInstances request and returns the ID."""
        config = _make_config()
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        # The stubber rejects the call unless the request matches these
        # parameters exactly (same keys, same values).
        expected_params = {
            "MinCount": 1,
            "MaxCount": 1,
            "LaunchTemplate": {
                "LaunchTemplateId": "lt-abc123",
                "Version": "$Latest",
            },
            "InstanceMarketOptions": {
                "MarketType": "spot",
                "SpotOptions": {
                    "SpotInstanceType": "one-time",
                    "InstanceInterruptionBehavior": "terminate",
                },
            },
            "SubnetId": "subnet-aaa",
            "UserData": "#!/bin/bash\necho hello",
            "TagSpecifications": [
                {
                    "ResourceType": "instance",
                    "Tags": [
                        {"Key": "Name", "Value": "nix-builder-slot001"},
                        {"Key": "AutoscalerSlot", "Value": "slot001"},
                        {"Key": "ManagedBy", "Value": "nix-builder-autoscaler"},
                        {"Key": "Service", "Value": "nix-builder"},
                        {"Key": "Environment", "Value": "dev"},
                    ],
                }
            ],
        }
        response = {
            "Instances": [{"InstanceId": "i-12345678"}],
            "OwnerId": "123456789012",
        }
        stubber.add_response("run_instances", response, expected_params)
        runtime = _make_runtime(stubber, ec2_client, config=config)

        iid = runtime.launch_instance("slot001", "#!/bin/bash\necho hello")

        assert iid == "i-12345678"
        stubber.assert_no_pending_responses()

    def test_round_robin_subnets(self):
        """Two consecutive launches use subnet-aaa and then subnet-bbb."""
        config = _make_config()
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        expected_subnets = ["subnet-aaa", "subnet-bbb"]
        for _ in expected_subnets:
            stubber.add_response(
                "run_instances",
                {"Instances": [{"InstanceId": "i-abc"}], "OwnerId": "123"},
            )
        runtime = _make_runtime(stubber, ec2_client, config=config)

        # Record each RunInstances call while still delegating to the stubbed
        # client, so the subnet chosen for every launch can be inspected
        # without having to spell out the complete request.
        with patch.object(
            ec2_client, "run_instances", wraps=ec2_client.run_instances
        ) as run_instances:
            runtime.launch_instance("slot001", "")
            runtime.launch_instance("slot002", "")

        used_subnets = [call.kwargs["SubnetId"] for call in run_instances.call_args_list]
        assert used_subnets == expected_subnets
        stubber.assert_no_pending_responses()
class TestDescribeInstance:
    """Tests for EC2Runtime.describe_instance, including Tailscale IP discovery."""

    # Launch time reported by the stubbed DescribeInstances response.
    _LAUNCH_TIME = datetime(2026, 1, 15, 12, 30, 0, tzinfo=UTC)

    @classmethod
    def _describe_running_instance(cls):
        """Describe a stubbed running instance and return the runtime's result.

        The stub reports instance ``i-running1`` in state ``running``, tagged
        ``AutoscalerSlot=slot001`` and launched at ``_LAUNCH_TIME``. Tests that
        patch ``_read_tailscale_status`` must call this while the patch is active.
        """
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        response = {
            "Reservations": [
                {
                    "Instances": [
                        {
                            "InstanceId": "i-running1",
                            "State": {"Code": 16, "Name": "running"},
                            "LaunchTime": cls._LAUNCH_TIME,
                            "Tags": [{"Key": "AutoscalerSlot", "Value": "slot001"}],
                        }
                    ],
                }
            ],
        }
        stubber.add_response(
            "describe_instances",
            response,
            {"InstanceIds": ["i-running1"]},
        )
        runtime = _make_runtime(stubber, ec2_client)
        info = runtime.describe_instance("i-running1")
        # Confirms the stubbed DescribeInstances response was actually consumed.
        stubber.assert_no_pending_responses()
        return info

    # Patched so the result does not depend on whether the machine running the
    # tests has a reachable tailscaled socket.
    @patch.object(EC2Runtime, "_read_tailscale_status", return_value=None)
    def test_normalizes_response(self, _mock_status):
        """State, launch time and a missing Tailscale IP are normalized."""
        info = self._describe_running_instance()
        assert info["state"] == "running"
        assert info["tailscale_ip"] is None
        assert info["launch_time"] == self._LAUNCH_TIME.isoformat()

    @patch.object(
        EC2Runtime,
        "_read_tailscale_status",
        return_value={
            "Peer": {
                "peer1": {
                    "HostName": "nix-builder-slot001-i-running1",
                    "Online": True,
                    "TailscaleIPs": ["100.64.0.10"],
                }
            }
        },
    )
    def test_discovers_tailscale_ip_from_localapi(self, _mock_status):
        """A single online peer for the slot supplies the Tailscale IP."""
        info = self._describe_running_instance()
        assert info["tailscale_ip"] == "100.64.0.10"

    @patch.object(EC2Runtime, "_read_tailscale_status", return_value={"Peer": {}})
    def test_discovery_unavailable_returns_none(self, _mock_status):
        """With no peers in the status, no Tailscale IP is reported."""
        info = self._describe_running_instance()
        assert info["tailscale_ip"] is None

    @patch.object(
        EC2Runtime,
        "_read_tailscale_status",
        return_value={
            "Peer": {
                "peer1": {
                    "HostName": "nix-builder-slot001-old",
                    "Online": True,
                    "TailscaleIPs": ["100.64.0.10"],
                },
                "peer2": {
                    "HostName": "nix-builder-slot001-new",
                    "Online": True,
                    "TailscaleIPs": ["100.64.0.11"],
                },
            }
        },
    )
    def test_ambiguous_slot_match_returns_none(self, _mock_status):
        """Two online peers for the same slot yield no Tailscale IP."""
        info = self._describe_running_instance()
        assert info["tailscale_ip"] is None

    @patch.object(
        EC2Runtime,
        "_read_tailscale_status",
        return_value={
            "Peer": {
                "stale": {
                    "HostName": "nix-builder-slot001",
                    "Online": False,
                    "Active": True,
                    "TailscaleIPs": ["100.64.0.10"],
                },
                "current": {
                    "HostName": "nix-builder-slot001",
                    "Online": True,
                    "Active": False,
                    "TailscaleIPs": ["100.64.0.11"],
                },
            }
        },
    )
    def test_ignores_active_but_offline_stale_peer(self, _mock_status):
        """The online peer's IP wins over an offline peer that is marked active."""
        info = self._describe_running_instance()
        assert info["tailscale_ip"] == "100.64.0.11"

    def test_localapi_permission_error_returns_none(self):
        """A PermissionError while connecting to the local API yields None."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        runtime = EC2Runtime(_make_config(), _client=ec2_client)
        with patch(
            "nix_builder_autoscaler.runtime.ec2._UnixSocketHTTPConnection.connect",
            side_effect=PermissionError,
        ):
            assert runtime._read_tailscale_status() is None

    def test_missing_instance_returns_terminated(self):
        """An empty DescribeInstances result is reported as a terminated instance."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        stubber.add_response(
            "describe_instances",
            {"Reservations": []},
            {"InstanceIds": ["i-gone"]},
        )
        runtime = _make_runtime(stubber, ec2_client)

        info = runtime.describe_instance("i-gone")

        assert info["state"] == "terminated"
        assert info["tailscale_ip"] is None
        assert info["launch_time"] is None
        stubber.assert_no_pending_responses()
class TestListManagedInstances:
    """Tests for EC2Runtime.list_managed_instances."""

    def test_filters_by_tag(self):
        """The request filters on the ManagedBy tag and instance state; results are normalized."""

        def ec2_instance(instance_id, state_code, state_name, slot):
            # One instance entry shaped like a DescribeInstances result.
            return {
                "InstanceId": instance_id,
                "State": {"Code": state_code, "Name": state_name},
                "Tags": [
                    {"Key": "AutoscalerSlot", "Value": slot},
                    {"Key": "ManagedBy", "Value": "nix-builder-autoscaler"},
                ],
            }

        client = boto3.client("ec2", region_name="us-east-1")
        stub = Stubber(client)
        request_filters = [
            {"Name": "tag:ManagedBy", "Values": ["nix-builder-autoscaler"]},
            {
                "Name": "instance-state-name",
                "Values": ["pending", "running", "shutting-down", "stopping"],
            },
        ]
        reservation = {
            "Instances": [
                ec2_instance("i-aaa", 16, "running", "slot001"),
                ec2_instance("i-bbb", 0, "pending", "slot002"),
            ],
        }
        stub.add_response(
            "describe_instances",
            {"Reservations": [reservation]},
            {"Filters": request_filters},
        )
        runtime = _make_runtime(stub, client)

        managed = runtime.list_managed_instances()

        expected = [
            ("i-aaa", "running", "slot001"),
            ("i-bbb", "pending", "slot002"),
        ]
        assert len(managed) == 2
        for entry, (instance_id, state, slot_id) in zip(managed, expected):
            assert entry["instance_id"] == instance_id
            assert entry["state"] == state
            assert entry["slot_id"] == slot_id
class TestTerminateInstance:
    """Tests for EC2Runtime.terminate_instance."""

    def test_calls_terminate_api(self):
        """terminate_instance issues TerminateInstances for the given ID without raising."""
        client = boto3.client("ec2", region_name="us-east-1")
        stub = Stubber(client)
        terminating = {
            "InstanceId": "i-kill",
            "CurrentState": {"Code": 32, "Name": "shutting-down"},
            "PreviousState": {"Code": 16, "Name": "running"},
        }
        stub.add_response(
            "terminate_instances",
            {"TerminatingInstances": [terminating]},
            {"InstanceIds": ["i-kill"]},
        )
        runtime = _make_runtime(stub, client)

        # The call itself is the check: it must complete without raising.
        runtime.terminate_instance("i-kill")

        stub.assert_no_pending_responses()
class TestErrorClassification:
    """Tests for how launch_instance classifies and retries EC2 client errors."""

    def test_insufficient_capacity_classified(self):
        """InsufficientInstanceCapacity surfaces as category 'capacity_unavailable'."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        stubber.add_client_error(
            "run_instances",
            service_error_code="InsufficientInstanceCapacity",
            service_message="Insufficient capacity",
        )
        runtime = _make_runtime(stubber, ec2_client)

        with pytest.raises(RuntimeAdapterError) as exc_info:
            runtime.launch_instance("slot001", "#!/bin/bash")

        # Checked outside the ``with`` block: statements after the raising call
        # inside it would never execute.
        assert exc_info.value.category == "capacity_unavailable"
        stubber.assert_no_pending_responses()

    @patch("nix_builder_autoscaler.runtime.ec2.time.sleep")
    def test_request_limit_exceeded_retried(self, mock_sleep):
        """A throttled first attempt is retried after sleeping and then succeeds."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        # First call: throttled
        stubber.add_client_error(
            "run_instances",
            service_error_code="RequestLimitExceeded",
            service_message="Rate exceeded",
        )
        # Second call: success
        stubber.add_response(
            "run_instances",
            {"Instances": [{"InstanceId": "i-retry123"}], "OwnerId": "123"},
        )
        runtime = _make_runtime(stubber, ec2_client)

        iid = runtime.launch_instance("slot001", "#!/bin/bash")

        assert iid == "i-retry123"
        assert mock_sleep.called
        stubber.assert_no_pending_responses()

    @patch("nix_builder_autoscaler.runtime.ec2.time.sleep")
    def test_request_limit_exceeded_exhausted(self, mock_sleep):
        """After max retries, RequestLimitExceeded raises with 'throttled' category."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)
        # 4 errors (max_retries=3: attempt 0,1,2,3 all fail)
        for _ in range(4):
            stubber.add_client_error(
                "run_instances",
                service_error_code="RequestLimitExceeded",
                service_message="Rate exceeded",
            )
        runtime = _make_runtime(stubber, ec2_client)

        with pytest.raises(RuntimeAdapterError) as exc_info:
            runtime.launch_instance("slot001", "#!/bin/bash")

        assert exc_info.value.category == "throttled"
        # All four stubbed errors must be consumed; otherwise the runtime gave
        # up before exhausting its retries.
        stubber.assert_no_pending_responses()