def preflight_validate(self) -> None:
    """Check at startup that the configured instance type is offered where we launch.

    Resolves the availability zones of the configured subnets, then asks EC2
    which of those zones offer ``self._instance_type``. The outcome is only
    logged:

    * ``preflight_misconfiguration`` at error level when no queried location
      offers the type. The message names the configured subnet AZs when the
      query was restricted to them, and the client's region otherwise.
    * ``preflight_misconfiguration`` at warning level when only some of the
      configured subnet AZs offer the type.
    * ``preflight_ok`` at info level when nothing is missing.

    Does nothing when no instance type is configured. Any exception raised
    during the check (API error, unexpected response shape) is caught and
    logged as ``preflight_validate_failed`` at warning level, so this method
    does not raise to its caller.
    """
    if not self._instance_type:
        # Nothing configured, so there is nothing to validate.
        return

    try:
        # AZs we will actually launch into, derived from the configured subnets.
        target_azs: set[str] = set()
        if self._subnet_ids:
            subnets = self._client.describe_subnets(SubnetIds=self._subnet_ids)
            target_azs = {
                subnet["AvailabilityZone"]
                for subnet in subnets.get("Subnets", [])
                if subnet.get("AvailabilityZone")
            }

        filters: list[dict[str, Any]] = [
            {"Name": "instance-type", "Values": [self._instance_type]},
        ]
        if target_azs:
            # Restrict the query to the AZs we care about; sorted only to keep
            # the request deterministic.
            filters.append({"Name": "location", "Values": sorted(target_azs)})

        request: dict[str, Any] = {
            "LocationType": "availability-zone",
            "Filters": filters,
        }
        available_azs: set[str] = set()
        while True:
            page = self._client.describe_instance_type_offerings(**request)
            available_azs.update(
                offering["Location"]
                for offering in page.get("InstanceTypeOfferings", [])
            )
            # The API is paginated: keep following NextToken so offerings on
            # later pages are not mistaken for missing AZs.
            next_token = page.get("NextToken")
            if not isinstance(next_token, str) or not next_token:
                break
            request["NextToken"] = next_token

        if not available_azs:
            if target_azs:
                # The query was filtered to the subnet AZs, so an empty result
                # only proves the type is absent from those AZs, not from the
                # whole region.
                where = f"any configured subnet AZ {sorted(target_azs)}"
            else:
                where = f"region {self._client.meta.region_name!r}"
            log.error(
                "preflight_misconfiguration",
                extra={
                    "error": (
                        f"instance type {self._instance_type!r} is not available in"
                        f" {where} - all launches will fail with Unsupported"
                    ),
                    "category": "misconfiguration",
                },
            )
            return

        missing_azs = target_azs - available_azs
        if missing_azs:
            log.warning(
                "preflight_misconfiguration",
                extra={
                    "error": (
                        f"instance type {self._instance_type!r} is not available in"
                        f" AZs {sorted(missing_azs)} - launches into those subnets will"
                        f" fail with Unsupported"
                    ),
                    "category": "misconfiguration",
                },
            )
        else:
            log.info(
                "preflight_ok",
                extra={
                    "error": None,
                    "category": None,
                },
            )
    except Exception as exc:
        # Deliberately broad: a failed preflight must not prevent startup.
        log.warning(
            "preflight_validate_failed",
            extra={"error": str(exc), "category": "unknown"},
        )
b/nix/modules/nixos/services/nix-builder-autoscaler.nix @@ -63,6 +63,12 @@ in description = "AWS region for EC2 launches."; }; + instanceType = lib.mkOption { + type = lib.types.str; + default = ""; + description = "EC2 instance type for nix builders. Used for preflight availability validation at startup."; + }; + launchTemplateIdFile = lib.mkOption { type = lib.types.nullOr lib.types.str; default = null; @@ -329,6 +335,7 @@ in [aws] region = "${cfg.aws.region}" + ${lib.optionalString (cfg.aws.instanceType != "") ''instance_type = "${cfg.aws.instanceType}"''} launch_template_id = "$launch_template_id" ${lib.optionalString ( cfg.aws.onDemandLaunchTemplateIdFile != null