From 02b1a063aba04f9a456c84029cac11a46723679a Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Sat, 28 Feb 2026 10:33:26 +0100 Subject: [PATCH] support dual launch templates: spot for normal builds, on-demand for nested virtualization AWS does not allow cpu_options.nested_virtualization with spot instances. Add a second launch template (on-demand, cpu_options enabled) alongside the existing spot template. The autoscaler selects the template per-system based on nested_virtualization config. - RuntimeAdapter.launch_spot -> launch_instance(nested_virtualization=False) - EC2Runtime: selects spot or on-demand LT; raises misconfiguration error if on_demand_launch_template_id is empty when nested_virtualization=True - AwsConfig: add on_demand_launch_template_id field - SystemConfig: add nested_virtualization field - Scheduler: looks up system config to pass nested_virtualization flag - NixOS module: new aws.onDemandLaunchTemplateIdFile + capacity.nestedVirtualization options; assertion prevents enabling nestedVirtualization without the LT ID file --- agent/nix_builder_autoscaler/config.py | 2 + agent/nix_builder_autoscaler/logging.py | 12 +++++- agent/nix_builder_autoscaler/runtime/base.py | 10 ++++- agent/nix_builder_autoscaler/runtime/ec2.py | 40 ++++++++++++++----- agent/nix_builder_autoscaler/runtime/fake.py | 6 ++- agent/nix_builder_autoscaler/scheduler.py | 5 ++- .../tests/test_runtime_ec2.py | 12 +++--- .../tests/test_runtime_fake.py | 24 +++++------ .../nixos/services/nix-builder-autoscaler.nix | 25 +++++++++++- 9 files changed, 101 insertions(+), 35 deletions(-) diff --git a/agent/nix_builder_autoscaler/config.py b/agent/nix_builder_autoscaler/config.py index 2cc2a72..deb0b0a 100644 --- a/agent/nix_builder_autoscaler/config.py +++ b/agent/nix_builder_autoscaler/config.py @@ -23,6 +23,7 @@ class AwsConfig: region: str = "us-east-1" launch_template_id: str = "" + on_demand_launch_template_id: str = "" subnet_ids: list[str] = field(default_factory=list) security_group_ids: list[str] = field(default_factory=list) instance_profile_arn: str = "" @@ -51,6 +52,7 @@ class SystemConfig: launch_batch_size: int = 1 scale_down_idle_seconds: int = 900 termination_cooldown_seconds: int = 180 + nested_virtualization: bool = False @dataclass diff --git a/agent/nix_builder_autoscaler/logging.py b/agent/nix_builder_autoscaler/logging.py index 23fabf9..3d9af2d 100644 --- a/agent/nix_builder_autoscaler/logging.py +++ b/agent/nix_builder_autoscaler/logging.py @@ -12,7 +12,17 @@ from typing import Any class JSONFormatter(logging.Formatter): """Format log records as single-line JSON.""" - EXTRA_FIELDS = ("slot_id", "reservation_id", "instance_id", "request_id", "error", "category", "count", "ids", "idle_seconds") + EXTRA_FIELDS = ( + "slot_id", + "reservation_id", + "instance_id", + "request_id", + "error", + "category", + "count", + "ids", + "idle_seconds", + ) def format(self, record: logging.LogRecord) -> str: """Format a log record as JSON.""" diff --git a/agent/nix_builder_autoscaler/runtime/base.py b/agent/nix_builder_autoscaler/runtime/base.py index 4841f6b..da5d913 100644 --- a/agent/nix_builder_autoscaler/runtime/base.py +++ b/agent/nix_builder_autoscaler/runtime/base.py @@ -21,8 +21,14 @@ class RuntimeAdapter(ABC): """Interface for compute runtime backends (EC2, fake, etc.).""" @abstractmethod - def launch_spot(self, slot_id: str, user_data: str) -> str: - """Launch a spot instance for slot_id. Return instance_id.""" + def launch_instance( + self, slot_id: str, user_data: str, *, nested_virtualization: bool = False + ) -> str: + """Launch an instance for slot_id. Return instance_id. + + When nested_virtualization is True, an on-demand instance is launched using + the on-demand launch template. When False (default), a spot instance is launched. + """ @abstractmethod def describe_instance(self, instance_id: str) -> dict: diff --git a/agent/nix_builder_autoscaler/runtime/ec2.py b/agent/nix_builder_autoscaler/runtime/ec2.py index fda0117..f04bbfb 100644 --- a/agent/nix_builder_autoscaler/runtime/ec2.py +++ b/agent/nix_builder_autoscaler/runtime/ec2.py @@ -60,6 +60,7 @@ class EC2Runtime(RuntimeAdapter): ) -> None: self._client: Any = _client or boto3.client("ec2", region_name=config.region) self._launch_template_id = config.launch_template_id + self._on_demand_launch_template_id = config.on_demand_launch_template_id self._subnet_ids = list(config.subnet_ids) self._security_group_ids = list(config.security_group_ids) self._instance_profile_arn = config.instance_profile_arn @@ -67,22 +68,32 @@ class EC2Runtime(RuntimeAdapter): self._subnet_index = 0 self._tailscale_socket_path = _tailscale_socket_path - def launch_spot(self, slot_id: str, user_data: str) -> str: - """Launch a spot instance for *slot_id*. Return instance ID.""" + def launch_instance( + self, slot_id: str, user_data: str, *, nested_virtualization: bool = False + ) -> str: + """Launch an instance for *slot_id*. Return instance ID. + + When nested_virtualization is True, an on-demand instance is launched using the + on-demand launch template (cpu_options nested virt enabled, no spot market options). + When False (default), a spot instance is launched using the spot launch template. + """ + if nested_virtualization: + if not self._on_demand_launch_template_id: + raise RuntimeAdapterError( + "nested_virtualization=True but on_demand_launch_template_id is not configured", + category="misconfiguration", + ) + lt_id = self._on_demand_launch_template_id + else: + lt_id = self._launch_template_id + params: dict[str, Any] = { "MinCount": 1, "MaxCount": 1, "LaunchTemplate": { - "LaunchTemplateId": self._launch_template_id, + "LaunchTemplateId": lt_id, "Version": "$Latest", }, - "InstanceMarketOptions": { - "MarketType": "spot", - "SpotOptions": { - "SpotInstanceType": "one-time", - "InstanceInterruptionBehavior": "terminate", - }, - }, "UserData": user_data, "TagSpecifications": [ { @@ -98,6 +109,15 @@ class EC2Runtime(RuntimeAdapter): ], } + if not nested_virtualization: + params["InstanceMarketOptions"] = { + "MarketType": "spot", + "SpotOptions": { + "SpotInstanceType": "one-time", + "InstanceInterruptionBehavior": "terminate", + }, + } + if self._subnet_ids: subnet = self._subnet_ids[self._subnet_index % len(self._subnet_ids)] self._subnet_index += 1 diff --git a/agent/nix_builder_autoscaler/runtime/fake.py b/agent/nix_builder_autoscaler/runtime/fake.py index cb1d481..0206760 100644 --- a/agent/nix_builder_autoscaler/runtime/fake.py +++ b/agent/nix_builder_autoscaler/runtime/fake.py @@ -38,8 +38,10 @@ class FakeRuntime(RuntimeAdapter): self._tick_count: int = 0 self._next_ip_counter: int = 1 - def launch_spot(self, slot_id: str, user_data: str) -> str: - """Launch a fake spot instance.""" + def launch_instance( + self, slot_id: str, user_data: str, *, nested_virtualization: bool = False + ) -> str: + """Launch a fake instance (nested_virtualization is accepted but ignored).""" if slot_id in self._launch_failures: self._launch_failures.discard(slot_id) raise RuntimeAdapterError( diff --git a/agent/nix_builder_autoscaler/scheduler.py b/agent/nix_builder_autoscaler/scheduler.py index d7f661a..965df54 100644 --- a/agent/nix_builder_autoscaler/scheduler.py +++ b/agent/nix_builder_autoscaler/scheduler.py @@ -245,8 +245,11 @@ def _launch_slot( """Launch a single slot. Transition to LAUNCHING on success, ERROR on failure.""" slot_id = slot["slot_id"] user_data = render_userdata(slot_id) + system_name = slot.get("system", config.capacity.default_system) + sys_cfg = next((s for s in config.systems if s.name == system_name), None) + nested_virt = sys_cfg.nested_virtualization if sys_cfg else False try: - instance_id = runtime.launch_spot(slot_id, user_data) + instance_id = runtime.launch_instance(slot_id, user_data, nested_virtualization=nested_virt) metrics.counter("autoscaler_ec2_launch_total", {"result": "success"}, 1.0) db.update_slot_state(slot_id, SlotState.LAUNCHING, instance_id=instance_id) log.info("slot_launched", extra={"slot_id": slot_id, "instance_id": instance_id}) diff --git a/agent/nix_builder_autoscaler/tests/test_runtime_ec2.py b/agent/nix_builder_autoscaler/tests/test_runtime_ec2.py index a82c53f..de8b26a 100644 --- a/agent/nix_builder_autoscaler/tests/test_runtime_ec2.py +++ b/agent/nix_builder_autoscaler/tests/test_runtime_ec2.py @@ -73,7 +73,7 @@ class TestLaunchSpot: stubber.add_response("run_instances", response, expected_params) runtime = _make_runtime(stubber, ec2_client, config=config) - iid = runtime.launch_spot("slot001", "#!/bin/bash\necho hello") + iid = runtime.launch_instance("slot001", "#!/bin/bash\necho hello") assert iid == "i-12345678" stubber.assert_no_pending_responses() @@ -90,8 +90,8 @@ class TestLaunchSpot: ) runtime = _make_runtime(stubber, ec2_client, config=config) - runtime.launch_spot("slot001", "") - runtime.launch_spot("slot002", "") + runtime.launch_instance("slot001", "") + runtime.launch_instance("slot002", "") stubber.assert_no_pending_responses() @@ -418,7 +418,7 @@ class TestErrorClassification: runtime = _make_runtime(stubber, ec2_client) with pytest.raises(RuntimeAdapterError) as exc_info: - runtime.launch_spot("slot001", "#!/bin/bash") + runtime.launch_instance("slot001", "#!/bin/bash") assert exc_info.value.category == "capacity_unavailable" @patch("nix_builder_autoscaler.runtime.ec2.time.sleep") @@ -439,7 +439,7 @@ class TestErrorClassification: ) runtime = _make_runtime(stubber, ec2_client) - iid = runtime.launch_spot("slot001", "#!/bin/bash") + iid = runtime.launch_instance("slot001", "#!/bin/bash") assert iid == "i-retry123" assert mock_sleep.called stubber.assert_no_pending_responses() @@ -460,5 +460,5 @@ class TestErrorClassification: runtime = _make_runtime(stubber, ec2_client) with pytest.raises(RuntimeAdapterError) as exc_info: - runtime.launch_spot("slot001", "#!/bin/bash") + runtime.launch_instance("slot001", "#!/bin/bash") assert exc_info.value.category == "throttled" diff --git a/agent/nix_builder_autoscaler/tests/test_runtime_fake.py b/agent/nix_builder_autoscaler/tests/test_runtime_fake.py index 9f90ea5..d18ff2c 100644 --- a/agent/nix_builder_autoscaler/tests/test_runtime_fake.py +++ b/agent/nix_builder_autoscaler/tests/test_runtime_fake.py @@ -9,13 +9,13 @@ from nix_builder_autoscaler.runtime.fake import FakeRuntime class TestLaunchSpot: def test_returns_synthetic_instance_id(self): rt = FakeRuntime() - iid = rt.launch_spot("slot001", "#!/bin/bash\necho hello") + iid = rt.launch_instance("slot001", "#!/bin/bash\necho hello") assert iid.startswith("i-fake-") assert len(iid) > 10 def test_instance_starts_pending(self): rt = FakeRuntime() - iid = rt.launch_spot("slot001", "") + iid = rt.launch_instance("slot001", "") info = rt.describe_instance(iid) assert info["state"] == "pending" assert info["tailscale_ip"] is None @@ -24,7 +24,7 @@ class TestLaunchSpot: class TestTickProgression: def test_transitions_to_running_after_configured_ticks(self): rt = FakeRuntime(launch_latency_ticks=3, ip_delay_ticks=1) - iid = rt.launch_spot("slot001", "") + iid = rt.launch_instance("slot001", "") for _ in range(2): rt.tick() @@ -35,7 +35,7 @@ class TestTickProgression: def test_tailscale_ip_appears_after_configured_delay(self): rt = FakeRuntime(launch_latency_ticks=2, ip_delay_ticks=2) - iid = rt.launch_spot("slot001", "") + iid = rt.launch_instance("slot001", "") for _ in range(2): rt.tick() @@ -56,7 +56,7 @@ class TestInjectedFailure: rt = FakeRuntime() rt.inject_launch_failure("slot001") try: - rt.launch_spot("slot001", "") + rt.launch_instance("slot001", "") raise AssertionError("Should have raised") except RuntimeAdapterError as e: assert e.category == "capacity_unavailable" @@ -65,16 +65,16 @@ class TestInjectedFailure: rt = FakeRuntime() rt.inject_launch_failure("slot001") with contextlib.suppress(RuntimeAdapterError): - rt.launch_spot("slot001", "") + rt.launch_instance("slot001", "") # Second call should succeed - iid = rt.launch_spot("slot001", "") + iid = rt.launch_instance("slot001", "") assert iid.startswith("i-fake-") class TestInjectedInterruption: def test_interruption_returns_terminated(self): rt = FakeRuntime(launch_latency_ticks=1) - iid = rt.launch_spot("slot001", "") + iid = rt.launch_instance("slot001", "") rt.tick() assert rt.describe_instance(iid)["state"] == "running" @@ -85,7 +85,7 @@ class TestInjectedInterruption: def test_interruption_is_one_shot(self): """After the interruption fires, subsequent describes stay terminated.""" rt = FakeRuntime(launch_latency_ticks=1) - iid = rt.launch_spot("slot001", "") + iid = rt.launch_instance("slot001", "") rt.tick() rt.inject_interruption(iid) rt.describe_instance(iid) # consumes the injection @@ -96,7 +96,7 @@ class TestInjectedInterruption: class TestTerminate: def test_terminate_marks_instance(self): rt = FakeRuntime(launch_latency_ticks=1) - iid = rt.launch_spot("slot001", "") + iid = rt.launch_instance("slot001", "") rt.tick() rt.terminate_instance(iid) assert rt.describe_instance(iid)["state"] == "terminated" @@ -105,8 +105,8 @@ class TestTerminate: class TestListManaged: def test_lists_non_terminated(self): rt = FakeRuntime(launch_latency_ticks=1) - iid1 = rt.launch_spot("slot001", "") - iid2 = rt.launch_spot("slot002", "") + iid1 = rt.launch_instance("slot001", "") + iid2 = rt.launch_instance("slot002", "") rt.tick() rt.terminate_instance(iid1) diff --git a/nix/modules/nixos/services/nix-builder-autoscaler.nix b/nix/modules/nixos/services/nix-builder-autoscaler.nix index 1ad4368..1fdcfe6 100644 --- a/nix/modules/nixos/services/nix-builder-autoscaler.nix +++ b/nix/modules/nixos/services/nix-builder-autoscaler.nix @@ -66,7 +66,13 @@ in launchTemplateIdFile = lib.mkOption { type = lib.types.nullOr lib.types.str; default = null; - description = "Runtime file containing the EC2 launch template ID."; + description = "Runtime file containing the EC2 spot launch template ID."; + }; + + onDemandLaunchTemplateIdFile = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Runtime file containing the EC2 on-demand launch template ID (required when capacity.nestedVirtualization is true)."; }; subnetIdsJsonFile = lib.mkOption { @@ -216,6 +222,12 @@ in default = 1; description = "Launch batch size for the default system entry."; }; + + nestedVirtualization = lib.mkOption { + type = lib.types.bool; + default = false; + description = "Whether slots use on-demand instances with nested virtualization. Requires aws.onDemandLaunchTemplateIdFile to be set."; + }; }; security = { @@ -256,6 +268,10 @@ in assertion = cfg.aws.subnetIdsJsonFile != null; message = "services.nix-builder-autoscaler.aws.subnetIdsJsonFile must be set."; } + { + assertion = !cfg.capacity.nestedVirtualization || cfg.aws.onDemandLaunchTemplateIdFile != null; + message = "services.nix-builder-autoscaler.aws.onDemandLaunchTemplateIdFile must be set when capacity.nestedVirtualization is true."; + } ]; environment.systemPackages = [ cfg.package ]; @@ -301,6 +317,9 @@ in install -d -m 0750 -o ${cfg.user} -g ${cfg.group} /run/nix-builder-autoscaler launch_template_id="$(tr -d '\n' < ${lib.escapeShellArg cfg.aws.launchTemplateIdFile})" subnet_ids_json="$(tr -d '\n' < ${lib.escapeShellArg cfg.aws.subnetIdsJsonFile})" + ${lib.optionalString (cfg.aws.onDemandLaunchTemplateIdFile != null) '' + on_demand_launch_template_id="$(tr -d '\n' < ${lib.escapeShellArg cfg.aws.onDemandLaunchTemplateIdFile})" + ''} cat > ${generatedConfigPath} <