autoscaler: preflight check instance type availability at startup
This commit is contained in:
parent
02b1a063ab
commit
1a355429cb
4 changed files with 83 additions and 0 deletions
|
|
@ -127,6 +127,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
runtime = EC2Runtime(config.aws)
|
runtime = EC2Runtime(config.aws)
|
||||||
|
runtime.preflight_validate()
|
||||||
haproxy = HAProxyRuntime(
|
haproxy = HAProxyRuntime(
|
||||||
config.haproxy.runtime_socket,
|
config.haproxy.runtime_socket,
|
||||||
config.haproxy.backend,
|
config.haproxy.backend,
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,7 @@ class AwsConfig:
|
||||||
"""[aws] section."""
|
"""[aws] section."""
|
||||||
|
|
||||||
region: str = "us-east-1"
|
region: str = "us-east-1"
|
||||||
|
instance_type: str = ""
|
||||||
launch_template_id: str = ""
|
launch_template_id: str = ""
|
||||||
on_demand_launch_template_id: str = ""
|
on_demand_launch_template_id: str = ""
|
||||||
subnet_ids: list[str] = field(default_factory=list)
|
subnet_ids: list[str] = field(default_factory=list)
|
||||||
|
|
|
||||||
|
|
@ -59,6 +59,7 @@ class EC2Runtime(RuntimeAdapter):
|
||||||
_tailscale_socket_path: str = "/run/tailscale/tailscaled.sock",
|
_tailscale_socket_path: str = "/run/tailscale/tailscaled.sock",
|
||||||
) -> None:
|
) -> None:
|
||||||
self._client: Any = _client or boto3.client("ec2", region_name=config.region)
|
self._client: Any = _client or boto3.client("ec2", region_name=config.region)
|
||||||
|
self._instance_type = config.instance_type
|
||||||
self._launch_template_id = config.launch_template_id
|
self._launch_template_id = config.launch_template_id
|
||||||
self._on_demand_launch_template_id = config.on_demand_launch_template_id
|
self._on_demand_launch_template_id = config.on_demand_launch_template_id
|
||||||
self._subnet_ids = list(config.subnet_ids)
|
self._subnet_ids = list(config.subnet_ids)
|
||||||
|
|
@ -68,6 +69,79 @@ class EC2Runtime(RuntimeAdapter):
|
||||||
self._subnet_index = 0
|
self._subnet_index = 0
|
||||||
self._tailscale_socket_path = _tailscale_socket_path
|
self._tailscale_socket_path = _tailscale_socket_path
|
||||||
|
|
||||||
|
def preflight_validate(self) -> None:
    """Check that the configured instance type is offered in the configured subnets' AZs.

    Logs a clear error when the instance type has no offering in the queried
    scope, and a warning when it is missing from only some of the subnet AZs,
    so misconfigurations surface at startup instead of on every failed launch
    attempt. Does nothing when no instance type is configured.

    The queried scope is the set of AZs resolved from the configured subnets;
    when no subnet AZ can be resolved the whole region is queried instead. The
    error message names whichever scope was actually queried.

    Never raises: any exception from the AWS calls is logged as a warning so a
    transient permissions or API problem does not prevent startup.
    """
    if not self._instance_type:
        return

    try:
        # Resolve the AZs the launcher can actually place instances in.
        target_azs: set[str] = set()
        if self._subnet_ids:
            subnets_resp = self._client.describe_subnets(SubnetIds=self._subnet_ids)
            target_azs = {
                subnet["AvailabilityZone"]
                for subnet in subnets_resp.get("Subnets", [])
                if subnet.get("AvailabilityZone")
            }

        filters: list[dict[str, Any]] = [
            {"Name": "instance-type", "Values": [self._instance_type]},
        ]
        if target_azs:
            # Sorted so the request is deterministic regardless of set ordering.
            filters.append({"Name": "location", "Values": sorted(target_azs)})

        # NOTE(review): only the first response page is read; NextToken is not followed.
        offerings_resp = self._client.describe_instance_type_offerings(
            LocationType="availability-zone",
            Filters=filters,
        )
        available_azs = {
            offering["Location"]
            for offering in offerings_resp.get("InstanceTypeOfferings", [])
        }

        if not available_azs:
            region = self._client.meta.region_name
            # When the query was narrowed to the subnet AZs, an empty result only
            # proves the type is absent from those AZs, not from the whole region.
            if target_azs:
                scope = f"any configured subnet AZ {sorted(target_azs)} of region {region!r}"
            else:
                scope = f"region {region!r}"
            log.error(
                "preflight_misconfiguration",
                extra={
                    "error": (
                        f"instance type {self._instance_type!r} is not available in"
                        f" {scope} - all launches will fail with Unsupported"
                    ),
                    "category": "misconfiguration",
                },
            )
            return

        missing_azs = target_azs - available_azs
        if missing_azs:
            log.warning(
                "preflight_misconfiguration",
                extra={
                    "error": (
                        f"instance type {self._instance_type!r} is not available in"
                        f" AZs {sorted(missing_azs)} - launches into those subnets will"
                        f" fail with Unsupported"
                    ),
                    "category": "misconfiguration",
                },
            )
        else:
            log.info(
                "preflight_ok",
                extra={
                    "error": None,
                    "category": None,
                },
            )
    except Exception as exc:
        # Deliberately broad: preflight is best-effort and must never block startup.
        log.warning(
            "preflight_validate_failed",
            extra={"error": str(exc), "category": "unknown"},
        )
|
||||||
|
|
||||||
def launch_instance(
|
def launch_instance(
|
||||||
self, slot_id: str, user_data: str, *, nested_virtualization: bool = False
|
self, slot_id: str, user_data: str, *, nested_virtualization: bool = False
|
||||||
) -> str:
|
) -> str:
|
||||||
|
|
|
||||||
|
|
@ -63,6 +63,12 @@ in
|
||||||
description = "AWS region for EC2 launches.";
|
description = "AWS region for EC2 launches.";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
instanceType = lib.mkOption {
|
||||||
|
type = lib.types.str;
|
||||||
|
default = "";
|
||||||
|
description = "EC2 instance type for nix builders. Used for preflight availability validation at startup.";
|
||||||
|
};
|
||||||
|
|
||||||
launchTemplateIdFile = lib.mkOption {
|
launchTemplateIdFile = lib.mkOption {
|
||||||
type = lib.types.nullOr lib.types.str;
|
type = lib.types.nullOr lib.types.str;
|
||||||
default = null;
|
default = null;
|
||||||
|
|
@ -329,6 +335,7 @@ in
|
||||||
|
|
||||||
[aws]
|
[aws]
|
||||||
region = "${cfg.aws.region}"
|
region = "${cfg.aws.region}"
|
||||||
|
${lib.optionalString (cfg.aws.instanceType != "") ''instance_type = "${cfg.aws.instanceType}"''}
|
||||||
launch_template_id = "$launch_template_id"
|
launch_template_id = "$launch_template_id"
|
||||||
${lib.optionalString (
|
${lib.optionalString (
|
||||||
cfg.aws.onDemandLaunchTemplateIdFile != null
|
cfg.aws.onDemandLaunchTemplateIdFile != null
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue