# nix-builder-autoscaler/agent/nix_builder_autoscaler/tests/test_runtime_ec2.py

"""Unit tests for the EC2 runtime adapter using botocore Stubber."""

from datetime import UTC, datetime
from unittest.mock import patch

import boto3
import pytest
from botocore.stub import Stubber

from nix_builder_autoscaler.config import AwsConfig
from nix_builder_autoscaler.runtime.base import RuntimeError as RuntimeAdapterError
from nix_builder_autoscaler.runtime.ec2 import EC2Runtime


def _make_config():
    """Return the AwsConfig fixture shared by the tests in this module.

    Two subnet ids are configured so that subnet selection across
    successive launches can be exercised.
    """
    settings = {
        "region": "us-east-1",
        "launch_template_id": "lt-abc123",
        "subnet_ids": ["subnet-aaa", "subnet-bbb"],
        "security_group_ids": ["sg-111"],
        "instance_profile_arn": "arn:aws:iam::123456789012:instance-profile/nix-builder",
    }
    return AwsConfig(**settings)


def _make_runtime(stubber, ec2_client, **kwargs):
    """Activate ``stubber`` and return an EC2Runtime bound to ``ec2_client``.

    Args:
        stubber: botocore Stubber wrapping ``ec2_client``. It is activated
            here, so queue every expected response before calling this helper.
        ec2_client: boto3 EC2 client handed to the runtime as ``_client``.
        **kwargs: Optional overrides. ``config`` replaces the default
            ``_make_config()`` fixture; ``environment`` replaces ``"dev"``.

    Returns:
        The EC2Runtime under test.

    Raises:
        TypeError: If an unrecognised override is supplied. Silently dropping
            a misspelled keyword would let a test pass against the default
            fixture instead of the one it meant to use.
    """
    # Remember whether the caller supplied a config so an explicit
    # ``config=None`` is still forwarded untouched.
    has_config = "config" in kwargs
    config = kwargs.pop("config", None)
    environment = kwargs.pop("environment", "dev")

    if kwargs:
        unexpected = ", ".join(sorted(kwargs))
        raise TypeError(
            f"_make_runtime() got unexpected keyword argument(s): {unexpected}"
        )

    if not has_config:
        config = _make_config()

    stubber.activate()
    return EC2Runtime(config, environment=environment, _client=ec2_client)


class TestLaunchSpot:
    """Tests for EC2Runtime.launch_instance request construction."""

    def test_correct_params_and_returns_instance_id(self):
        """The RunInstances request matches exactly and the new id is returned."""
        config = _make_config()
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)

        # The Stubber rejects the call unless the request equals this dict,
        # so every key below is part of the assertion.
        expected_params = {
            "MinCount": 1,
            "MaxCount": 1,
            "LaunchTemplate": {
                "LaunchTemplateId": "lt-abc123",
                "Version": "$Latest",
            },
            "InstanceMarketOptions": {
                "MarketType": "spot",
                "SpotOptions": {
                    "SpotInstanceType": "one-time",
                    "InstanceInterruptionBehavior": "terminate",
                },
            },
            "SubnetId": "subnet-aaa",
            "UserData": "#!/bin/bash\necho hello",
            "TagSpecifications": [
                {
                    "ResourceType": "instance",
                    "Tags": [
                        {"Key": "Name", "Value": "nix-builder-slot001"},
                        {"Key": "AutoscalerSlot", "Value": "slot001"},
                        {"Key": "ManagedBy", "Value": "nix-builder-autoscaler"},
                        {"Key": "Service", "Value": "nix-builder"},
                        {"Key": "Environment", "Value": "dev"},
                    ],
                }
            ],
        }

        response = {
            "Instances": [{"InstanceId": "i-12345678"}],
            "OwnerId": "123456789012",
        }

        stubber.add_response("run_instances", response, expected_params)
        runtime = _make_runtime(stubber, ec2_client, config=config)

        iid = runtime.launch_instance("slot001", "#!/bin/bash\necho hello")
        assert iid == "i-12345678"
        stubber.assert_no_pending_responses()

    def test_round_robin_subnets(self):
        """Two consecutive launches use subnet-aaa and then subnet-bbb."""
        config = _make_config()
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)

        # One canned response per launch; the request parameters are checked
        # through the spy below rather than through Stubber expected_params.
        for _ in range(2):
            stubber.add_response(
                "run_instances",
                {"Instances": [{"InstanceId": "i-abc"}], "OwnerId": "123"},
            )

        # Spy on run_instances (still delegating to the stubbed client) so the
        # SubnetId of each request can be inspected. The patch is installed
        # before the runtime is built so the runtime only ever sees the spy.
        with patch.object(
            ec2_client, "run_instances", wraps=ec2_client.run_instances
        ) as run_instances_spy:
            runtime = _make_runtime(stubber, ec2_client, config=config)
            runtime.launch_instance("slot001", "")
            runtime.launch_instance("slot002", "")

        used_subnets = [
            call.kwargs["SubnetId"] for call in run_instances_spy.call_args_list
        ]
        assert used_subnets == ["subnet-aaa", "subnet-bbb"]
        stubber.assert_no_pending_responses()


class TestDescribeInstance:
    """Tests for EC2Runtime.describe_instance and Tailscale IP discovery."""

    # Shared fixture values for the single running instance used below.
    _INSTANCE_ID = "i-running1"
    _LAUNCH_TIME = datetime(2026, 1, 15, 12, 30, 0, tzinfo=UTC)

    @classmethod
    def _describe_running_instance(cls):
        """Describe a stubbed running instance tagged ``AutoscalerSlot=slot001``.

        Queues exactly one DescribeInstances response for ``_INSTANCE_ID`` and
        returns whatever ``describe_instance`` produces for it. Callers patch
        ``EC2Runtime._read_tailscale_status`` around this call to control the
        Tailscale peer data seen by the runtime.
        """
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)

        response = {
            "Reservations": [
                {
                    "Instances": [
                        {
                            "InstanceId": cls._INSTANCE_ID,
                            "State": {"Code": 16, "Name": "running"},
                            "LaunchTime": cls._LAUNCH_TIME,
                            "Tags": [
                                {"Key": "AutoscalerSlot", "Value": "slot001"},
                            ],
                        }
                    ],
                }
            ],
        }
        stubber.add_response(
            "describe_instances",
            response,
            {"InstanceIds": [cls._INSTANCE_ID]},
        )
        runtime = _make_runtime(stubber, ec2_client)
        return runtime.describe_instance(cls._INSTANCE_ID)

    # Patched so this unit test never touches the host's Tailscale LocalAPI
    # socket; None is the value _read_tailscale_status reports on failure.
    @patch.object(EC2Runtime, "_read_tailscale_status", return_value=None)
    def test_normalizes_response(self, _mock_status):
        """State and launch time are normalized; no Tailscale IP without status."""
        info = self._describe_running_instance()
        assert info["state"] == "running"
        assert info["tailscale_ip"] is None
        assert info["launch_time"] == self._LAUNCH_TIME.isoformat()

    @patch.object(
        EC2Runtime,
        "_read_tailscale_status",
        return_value={
            "Peer": {
                "peer1": {
                    "HostName": "nix-builder-slot001-i-running1",
                    "Online": True,
                    "TailscaleIPs": ["100.64.0.10"],
                }
            }
        },
    )
    def test_discovers_tailscale_ip_from_localapi(self, _mock_status):
        """A single online peer matching the slot supplies the Tailscale IP."""
        info = self._describe_running_instance()
        assert info["tailscale_ip"] == "100.64.0.10"

    @patch.object(EC2Runtime, "_read_tailscale_status", return_value={"Peer": {}})
    def test_discovery_unavailable_returns_none(self, _mock_status):
        """An empty peer list yields no Tailscale IP."""
        info = self._describe_running_instance()
        assert info["tailscale_ip"] is None

    @patch.object(
        EC2Runtime,
        "_read_tailscale_status",
        return_value={
            "Peer": {
                "peer1": {
                    "HostName": "nix-builder-slot001-old",
                    "Online": True,
                    "TailscaleIPs": ["100.64.0.10"],
                },
                "peer2": {
                    "HostName": "nix-builder-slot001-new",
                    "Online": True,
                    "TailscaleIPs": ["100.64.0.11"],
                },
            }
        },
    )
    def test_ambiguous_slot_match_returns_none(self, _mock_status):
        """Two online peers for the same slot are ambiguous, so no IP is chosen."""
        info = self._describe_running_instance()
        assert info["tailscale_ip"] is None

    @patch.object(
        EC2Runtime,
        "_read_tailscale_status",
        return_value={
            "Peer": {
                "stale": {
                    "HostName": "nix-builder-slot001",
                    "Online": False,
                    "Active": True,
                    "TailscaleIPs": ["100.64.0.10"],
                },
                "current": {
                    "HostName": "nix-builder-slot001",
                    "Online": True,
                    "Active": False,
                    "TailscaleIPs": ["100.64.0.11"],
                },
            }
        },
    )
    def test_ignores_active_but_offline_stale_peer(self, _mock_status):
        """The online peer wins over an offline peer that is still marked active."""
        info = self._describe_running_instance()
        assert info["tailscale_ip"] == "100.64.0.11"

    def test_localapi_permission_error_returns_none(self):
        """A PermissionError while connecting to the LocalAPI is reported as None."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        runtime = EC2Runtime(_make_config(), _client=ec2_client)

        with patch(
            "nix_builder_autoscaler.runtime.ec2._UnixSocketHTTPConnection.connect",
            side_effect=PermissionError,
        ):
            assert runtime._read_tailscale_status() is None

    def test_missing_instance_returns_terminated(self):
        """An empty DescribeInstances result is reported as a terminated instance."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)

        stubber.add_response(
            "describe_instances",
            {"Reservations": []},
            {"InstanceIds": ["i-gone"]},
        )
        runtime = _make_runtime(stubber, ec2_client)

        info = runtime.describe_instance("i-gone")
        assert info["state"] == "terminated"
        assert info["tailscale_ip"] is None
        assert info["launch_time"] is None


class TestListManagedInstances:
    """Tests for EC2Runtime.list_managed_instances."""

    def test_filters_by_tag(self):
        """The ManagedBy tag and state filters are sent; results are normalized."""
        client = boto3.client("ec2", region_name="us-east-1")
        stub = Stubber(client)

        tag_filter = {"Name": "tag:ManagedBy", "Values": ["nix-builder-autoscaler"]}
        state_filter = {
            "Name": "instance-state-name",
            "Values": ["pending", "running", "shutting-down", "stopping"],
        }

        running_instance = {
            "InstanceId": "i-aaa",
            "State": {"Code": 16, "Name": "running"},
            "Tags": [
                {"Key": "AutoscalerSlot", "Value": "slot001"},
                {"Key": "ManagedBy", "Value": "nix-builder-autoscaler"},
            ],
        }
        pending_instance = {
            "InstanceId": "i-bbb",
            "State": {"Code": 0, "Name": "pending"},
            "Tags": [
                {"Key": "AutoscalerSlot", "Value": "slot002"},
                {"Key": "ManagedBy", "Value": "nix-builder-autoscaler"},
            ],
        }

        stub.add_response(
            "describe_instances",
            {"Reservations": [{"Instances": [running_instance, pending_instance]}]},
            {"Filters": [tag_filter, state_filter]},
        )
        runtime = _make_runtime(stub, client)

        managed = runtime.list_managed_instances()

        # Compare id, state and slot of every entry in one go; order matters.
        summary = [
            (entry["instance_id"], entry["state"], entry["slot_id"])
            for entry in managed
        ]
        assert summary == [
            ("i-aaa", "running", "slot001"),
            ("i-bbb", "pending", "slot002"),
        ]


class TestTerminateInstance:
    """Tests for EC2Runtime.terminate_instance."""

    def test_calls_terminate_api(self):
        """TerminateInstances is called once with the given instance id."""
        client = boto3.client("ec2", region_name="us-east-1")
        stub = Stubber(client)

        state_change = {
            "InstanceId": "i-kill",
            "CurrentState": {"Code": 32, "Name": "shutting-down"},
            "PreviousState": {"Code": 16, "Name": "running"},
        }
        stub.add_response(
            "terminate_instances",
            {"TerminatingInstances": [state_change]},
            {"InstanceIds": ["i-kill"]},
        )
        runtime = _make_runtime(stub, client)

        # The call itself must succeed without raising.
        runtime.terminate_instance("i-kill")
        stub.assert_no_pending_responses()


class TestErrorClassification:
    """Tests for how EC2 client errors are categorized and retried."""

    @staticmethod
    def _queue_throttle_errors(stubber, count):
        """Queue ``count`` RequestLimitExceeded errors for run_instances."""
        for _ in range(count):
            stubber.add_client_error(
                "run_instances",
                service_error_code="RequestLimitExceeded",
                service_message="Rate exceeded",
            )

    def test_insufficient_capacity_classified(self):
        """InsufficientInstanceCapacity surfaces as 'capacity_unavailable'."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)

        stubber.add_client_error(
            "run_instances",
            service_error_code="InsufficientInstanceCapacity",
            service_message="Insufficient capacity",
        )
        runtime = _make_runtime(stubber, ec2_client)

        with pytest.raises(RuntimeAdapterError) as exc_info:
            runtime.launch_instance("slot001", "#!/bin/bash")
        assert exc_info.value.category == "capacity_unavailable"

    @patch("nix_builder_autoscaler.runtime.ec2.time.sleep")
    def test_request_limit_exceeded_retried(self, mock_sleep):
        """A single throttling error is retried and the launch then succeeds."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)

        # First call: throttled
        self._queue_throttle_errors(stubber, 1)
        # Second call: success
        stubber.add_response(
            "run_instances",
            {"Instances": [{"InstanceId": "i-retry123"}], "OwnerId": "123"},
        )
        runtime = _make_runtime(stubber, ec2_client)

        iid = runtime.launch_instance("slot001", "#!/bin/bash")
        assert iid == "i-retry123"
        assert mock_sleep.called
        stubber.assert_no_pending_responses()

    @patch("nix_builder_autoscaler.runtime.ec2.time.sleep")
    def test_request_limit_exceeded_exhausted(self, mock_sleep):
        """After max retries, RequestLimitExceeded raises with 'throttled' category."""
        ec2_client = boto3.client("ec2", region_name="us-east-1")
        stubber = Stubber(ec2_client)

        # 4 errors (max_retries=3: attempt 0,1,2,3 all fail)
        self._queue_throttle_errors(stubber, 4)
        runtime = _make_runtime(stubber, ec2_client)

        with pytest.raises(RuntimeAdapterError) as exc_info:
            runtime.launch_instance("slot001", "#!/bin/bash")
        assert exc_info.value.category == "throttled"
        # The adapter must have backed off between attempts ...
        assert mock_sleep.called
        # ... and must have used every queued attempt; a leftover response
        # would mean it gave up before exhausting the documented retry budget.
        stubber.assert_no_pending_responses()