nix-builder-autoscaler/agent/nix_builder_autoscaler/config.py
Abel Luck 02b1a063ab
Some checks failed
buildbot/nix-eval Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.package-nix-builder-autoscaler Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.package-default Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.app-autoscalerctl Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.app-default Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.app-nix-builder-autoscaler Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.nix-builder-autoscaler-pyright Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.nix-builder-autoscaler-integration-tests Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.nix-builder-autoscaler-ruff Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.nix-builder-autoscaler-unit-tests Build done.
buildbot/nix-build gitea:ops/nix-builder-autoscaler#checks.x86_64-linux.package-buildbot-autoscale-ext Build done.
buildbot/nix-build Build done.
support dual launch templates: spot for normal builds, on-demand for nested virtualization
AWS does not allow cpu_options.nested_virtualization with spot instances. Add a second
launch template (on-demand, cpu_options enabled) alongside the existing spot template.
The autoscaler selects the template per-system based on nested_virtualization config.

- RuntimeAdapter.launch_spot -> launch_instance(nested_virtualization=False)
- EC2Runtime: selects spot or on-demand LT; raises misconfiguration error if
  on_demand_launch_template_id is empty when nested_virtualization=True
- AwsConfig: add on_demand_launch_template_id field
- SystemConfig: add nested_virtualization field
- Scheduler: looks up system config to pass nested_virtualization flag
- NixOS module: new aws.onDemandLaunchTemplateIdFile + capacity.nestedVirtualization
  options; assertion prevents enabling nestedVirtualization without the LT ID file
2026-02-28 10:33:26 +01:00

163 lines
4.8 KiB
Python

"""Configuration loading from TOML with environment variable overrides."""
from __future__ import annotations
import os
import tomllib
from dataclasses import dataclass, field
from pathlib import Path
@dataclass
class ServerConfig:
    """Settings taken from the ``[server]`` table of the config file.

    Every field carries a default, so the table (or any key inside it)
    may be left out of the TOML file.
    """

    # Filesystem path of the daemon socket.
    socket_path: str = "/run/nix-builder-autoscaler/daemon.sock"

    # Log verbosity, given by name.
    log_level: str = "info"

    # Filesystem path of the state database.
    db_path: str = "/var/lib/nix-builder-autoscaler/state.db"
@dataclass
class AwsConfig:
    """Settings taken from the ``[aws]`` table of the config file.

    All fields default to empty values except ``region``.
    """

    region: str = "us-east-1"

    # Two launch templates are configured side by side.
    # NOTE(review): per the change description, the first is the spot
    # template used for normal builds and the second is the on-demand
    # template used for systems that need nested virtualization — confirm
    # against the EC2 runtime.
    launch_template_id: str = ""
    on_demand_launch_template_id: str = ""

    # Each instance gets its own fresh list (default_factory), so the
    # defaults are never shared between AwsConfig objects.
    subnet_ids: list[str] = field(default_factory=list)
    security_group_ids: list[str] = field(default_factory=list)

    instance_profile_arn: str = ""
@dataclass
class HaproxyConfig:
    """Settings taken from the ``[haproxy]`` table of the config file."""

    # Filesystem path of the HAProxy runtime (admin) socket.
    runtime_socket: str = "/run/haproxy/admin.sock"

    # Name of the HAProxy backend to operate on.
    backend: str = "all"

    # Slot naming and sizing.
    # NOTE(review): presumably server slots are named "<slot_prefix><n>"
    # for slot_count entries — confirm against the HAProxy adapter.
    slot_prefix: str = "slot"
    slot_count: int = 8

    # NOTE(review): looks like the number of successful checks required
    # before a slot counts as ready — confirm against the consumer.
    check_ready_up_count: int = 2
@dataclass
class SystemConfig:
    """One ``[[systems]]`` entry: the capacity policy for a single system.

    A system is identified by ``name`` (for example ``x86_64-linux``).
    """

    name: str = "x86_64-linux"

    # Slot bounds and sizing for this system.
    min_slots: int = 0
    max_slots: int = 8
    target_warm_slots: int = 0
    max_leases_per_slot: int = 1
    launch_batch_size: int = 1

    # Timings, expressed in seconds.
    scale_down_idle_seconds: int = 900
    termination_cooldown_seconds: int = 180

    # NOTE(review): per the change description, a true value makes the
    # autoscaler pick the on-demand launch template for this system
    # instead of the spot one — confirm against the scheduler/runtime.
    nested_virtualization: bool = False
@dataclass
class CapacityConfig:
    """Settings taken from the ``[capacity]`` table — the global defaults."""

    # System name used when none is specified.
    default_system: str = "x86_64-linux"

    # Slot bounds and sizing.
    min_slots: int = 0
    max_slots: int = 8
    target_warm_slots: int = 0
    max_leases_per_slot: int = 1

    # Reservation lifetime and idle scale-down delay, in seconds.
    reservation_ttl_seconds: int = 1200
    idle_scale_down_seconds: int = 900

    # Timeouts, in seconds.
    # NOTE(review): the names suggest one timeout per slot lifecycle
    # phase — confirm against the state machine that consumes them.
    drain_timeout_seconds: int = 120
    launch_timeout_seconds: int = 300
    boot_timeout_seconds: int = 300
    binding_timeout_seconds: int = 180
    terminating_timeout_seconds: int = 300

    # Cooldown after a termination, in seconds.
    termination_cooldown_seconds: int = 180
@dataclass
class SecurityConfig:
    """Settings taken from the ``[security]`` table of the config file."""

    # Permission bits for the socket, kept as a string.
    # NOTE(review): presumably parsed as octal by the consumer — confirm.
    socket_mode: str = "0660"

    # Owning user and group names for the socket.
    socket_owner: str = "buildbot"
    socket_group: str = "buildbot"
@dataclass
class SchedulerConfig:
    """Settings taken from the ``[scheduler]`` table of the config file."""

    # Both intervals are floats expressed in seconds.
    tick_seconds: float = 3.0
    reconcile_seconds: float = 15.0
@dataclass
class AppConfig:
    """Root configuration object aggregating every config section.

    Each section attribute is created from its dataclass defaults, so a
    bare ``AppConfig()`` is fully populated; ``systems`` starts out empty.
    """

    # One attribute per TOML table, each built fresh per instance.
    server: ServerConfig = field(default_factory=ServerConfig)
    aws: AwsConfig = field(default_factory=AwsConfig)
    haproxy: HaproxyConfig = field(default_factory=HaproxyConfig)
    capacity: CapacityConfig = field(default_factory=CapacityConfig)
    security: SecurityConfig = field(default_factory=SecurityConfig)
    scheduler: SchedulerConfig = field(default_factory=SchedulerConfig)

    # Entries of the ``[[systems]]`` array of tables.
    systems: list[SystemConfig] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Environment variable overrides
# ---------------------------------------------------------------------------
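# AUTOSCALER_TAILSCALE_API_TOKEN — Tailscale API token for IP discovery (not read in this module)
# AWS_REGION — overrides aws.region when set to a non-empty value
# AWS_ACCESS_KEY_ID — explicit AWS credential (not read in this module)
# AWS_SECRET_ACCESS_KEY — explicit AWS credential (not read in this module)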
def _apply_env_overrides(cfg: AppConfig) -> None:
    """Overlay environment-provided values onto *cfg* in place.

    Only ``AWS_REGION`` is consulted: when it is set to a non-empty
    string it replaces ``cfg.aws.region``. An unset or empty variable
    leaves the configured region untouched.

    Args:
        cfg: Configuration object to mutate.
    """
    if env_region := os.environ.get("AWS_REGION"):
        cfg.aws.region = env_region
def _build_dataclass(cls: type, data: dict) -> object:  # noqa: ANN001
    """Instantiate dataclass *cls* from the matching entries of *data*.

    Keys of *data* that do not name a field of *cls* are dropped
    silently; fields missing from *data* fall back to their defaults.

    Args:
        cls: The dataclass type to instantiate.
        data: Mapping of candidate field names to values.

    Returns:
        A new instance of *cls*.
    """
    # __dataclass_fields__ maps field name -> Field, so a key lookup is
    # the same membership test as comparing against every field's name.
    known_fields = cls.__dataclass_fields__  # type: ignore[attr-defined]
    kwargs = {}
    for key, value in data.items():
        if key in known_fields:
            kwargs[key] = value
    return cls(**kwargs)
def load_config(path: Path) -> AppConfig:
    """Read a TOML file and assemble an AppConfig from it.

    Every recognised top-level table replaces the matching default
    section; tables missing from the file keep their defaults. Each
    ``[[systems]]`` entry becomes one SystemConfig. Keys that a section
    does not define are ignored. Environment overrides are applied last.

    Args:
        path: Path to the TOML config file.

    Returns:
        The populated AppConfig instance.
    """
    with open(path, "rb") as handle:
        raw = tomllib.load(handle)

    # (TOML table name / AppConfig attribute, dataclass built from it),
    # processed in this order.
    section_types = (
        ("server", ServerConfig),
        ("aws", AwsConfig),
        ("haproxy", HaproxyConfig),
        ("capacity", CapacityConfig),
        ("security", SecurityConfig),
        ("scheduler", SchedulerConfig),
    )

    cfg = AppConfig()
    for section, section_cls in section_types:
        if section in raw:
            setattr(cfg, section, _build_dataclass(section_cls, raw[section]))

    if "systems" in raw:
        systems = raw["systems"]
        cfg.systems = [_build_dataclass(SystemConfig, s) for s in systems]  # type: ignore[misc]

    _apply_env_overrides(cfg)
    return cfg