autoscaler: preflight check instance type availability at startup

This commit is contained in:
Abel Luck 2026-02-28 16:07:53 +01:00
parent 02b1a063ab
commit 1a355429cb
4 changed files with 83 additions and 0 deletions

View file

@ -127,6 +127,7 @@ def main() -> None:
)
runtime = EC2Runtime(config.aws)
runtime.preflight_validate()
haproxy = HAProxyRuntime(
config.haproxy.runtime_socket,
config.haproxy.backend,

View file

@ -22,6 +22,7 @@ class AwsConfig:
"""[aws] section."""
region: str = "us-east-1"
instance_type: str = ""
launch_template_id: str = ""
on_demand_launch_template_id: str = ""
subnet_ids: list[str] = field(default_factory=list)

View file

@ -59,6 +59,7 @@ class EC2Runtime(RuntimeAdapter):
_tailscale_socket_path: str = "/run/tailscale/tailscaled.sock",
) -> None:
self._client: Any = _client or boto3.client("ec2", region_name=config.region)
self._instance_type = config.instance_type
self._launch_template_id = config.launch_template_id
self._on_demand_launch_template_id = config.on_demand_launch_template_id
self._subnet_ids = list(config.subnet_ids)
@ -68,6 +69,79 @@ class EC2Runtime(RuntimeAdapter):
self._subnet_index = 0
self._tailscale_socket_path = _tailscale_socket_path
def preflight_validate(self) -> None:
    """Check that the configured instance type is offered where launches will happen.

    When subnets are configured, their availability zones are resolved via
    ``describe_subnets`` and the offerings query is restricted to those AZs;
    otherwise every AZ offering in the client's region is considered.

    Outcomes (all reported through ``log``, nothing is returned):

    * no offering found at all -> ``error`` ``preflight_misconfiguration``;
      the message names the configured subnet AZs when the query was scoped
      to them, and the region otherwise.
    * offered in some but not all subnet AZs -> ``warning``
      ``preflight_misconfiguration`` listing the missing AZs.
    * offered everywhere it is needed -> ``info`` ``preflight_ok``.

    Never raises: any exception (API failure, unexpected response shape) is
    logged as the warning ``preflight_validate_failed`` so that a transient
    or permissions problem does not prevent startup. Does nothing when no
    instance type is configured.
    """
    if not self._instance_type:
        # Nothing to validate against.
        return
    try:
        # AZs we will actually launch into, derived from the configured subnets.
        target_azs: set[str] = set()
        if self._subnet_ids:
            resp = self._client.describe_subnets(SubnetIds=self._subnet_ids)
            target_azs = {
                subnet["AvailabilityZone"]
                for subnet in resp.get("Subnets", [])
                if subnet.get("AvailabilityZone")
            }

        filters: list[dict[str, Any]] = [
            {"Name": "instance-type", "Values": [self._instance_type]},
        ]
        if target_azs:
            # Sorted only to make the request deterministic.
            filters.append({"Name": "location", "Values": sorted(target_azs)})
        resp = self._client.describe_instance_type_offerings(
            LocationType="availability-zone",
            Filters=filters,
        )
        available_azs = {o["Location"] for o in resp.get("InstanceTypeOfferings", [])}

        if not available_azs:
            region = self._client.meta.region_name
            if target_azs:
                # The query was limited to the subnet AZs, so an empty result
                # says nothing about the rest of the region; report only what
                # was actually checked.
                where = (
                    f"any configured subnet AZ {sorted(target_azs)}"
                    f" of region {region!r}"
                )
            else:
                where = f"region {region!r}"
            log.error(
                "preflight_misconfiguration",
                extra={
                    "error": (
                        f"instance type {self._instance_type!r} is not available in"
                        f" {where} - all launches will fail with Unsupported"
                    ),
                    "category": "misconfiguration",
                },
            )
            return

        missing_azs = target_azs - available_azs
        if missing_azs:
            log.warning(
                "preflight_misconfiguration",
                extra={
                    "error": (
                        f"instance type {self._instance_type!r} is not available in"
                        f" AZs {sorted(missing_azs)} - launches into those subnets will"
                        f" fail with Unsupported"
                    ),
                    "category": "misconfiguration",
                },
            )
        else:
            log.info(
                "preflight_ok",
                extra={
                    "error": None,
                    "category": None,
                },
            )
    except Exception as exc:
        # Deliberately broad: the preflight is best-effort and must never
        # block startup.
        log.warning(
            "preflight_validate_failed",
            extra={"error": str(exc), "category": "unknown"},
        )
def launch_instance(
self, slot_id: str, user_data: str, *, nested_virtualization: bool = False
) -> str:

View file

@ -63,6 +63,12 @@ in
description = "AWS region for EC2 launches.";
};
instanceType = lib.mkOption {
type = lib.types.str;
default = "";
description = "EC2 instance type for nix builders. Used for preflight availability validation at startup.";
};
launchTemplateIdFile = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
@ -329,6 +335,7 @@ in
[aws]
region = "${cfg.aws.region}"
${lib.optionalString (cfg.aws.instanceType != "") ''instance_type = "${cfg.aws.instanceType}"''}
launch_template_id = "$launch_template_id"
${lib.optionalString (
cfg.aws.onDemandLaunchTemplateIdFile != null