# nix-builder-autoscaler/agent/nix_builder_autoscaler/tests/test_scheduler.py

"""Scheduler unit tests — Plan 03."""

from nix_builder_autoscaler.config import AppConfig, AwsConfig, CapacityConfig
from nix_builder_autoscaler.metrics import MetricsRegistry
from nix_builder_autoscaler.models import ReservationPhase, SlotState
from nix_builder_autoscaler.providers.clock import FakeClock
from nix_builder_autoscaler.runtime.fake import FakeRuntime
from nix_builder_autoscaler.scheduler import scheduling_tick
from nix_builder_autoscaler.state_db import StateDB


def _make_env(
    slot_count=3,
    max_slots=3,
    max_leases=1,
    idle_scale_down_seconds=900,
    target_warm=0,
    min_slots=0,
):
    """Assemble the collaborators a scheduler test needs.

    An in-memory ``StateDB`` is created on top of a ``FakeClock``, its schema
    is initialised and its slots are seeded through ``init_slots`` using
    ``slot_count``. The capacity keyword arguments are forwarded into a
    ``CapacityConfig`` (with ``reservation_ttl_seconds`` fixed at 1200).

    Returns:
        The tuple ``(db, runtime, config, clock, metrics)``.
    """
    fake_clock = FakeClock()

    state_db = StateDB(":memory:", clock=fake_clock)
    state_db.init_schema()
    state_db.init_slots("slot", slot_count, "x86_64-linux", "all")

    fake_runtime = FakeRuntime(launch_latency_ticks=2, ip_delay_ticks=1)

    capacity_cfg = CapacityConfig(
        max_slots=max_slots,
        max_leases_per_slot=max_leases,
        idle_scale_down_seconds=idle_scale_down_seconds,
        target_warm_slots=target_warm,
        min_slots=min_slots,
        reservation_ttl_seconds=1200,
    )
    aws_cfg = AwsConfig(region="us-east-1")
    app_config = AppConfig(capacity=capacity_cfg, aws=aws_cfg)

    return state_db, fake_runtime, app_config, fake_clock, MetricsRegistry()


def _make_slot_ready(db, slot_id, instance_id="i-test1", ip="100.64.0.1"):
    """Walk ``slot_id`` through LAUNCHING, BOOTING and BINDING into READY.

    ``instance_id`` is supplied on the LAUNCHING transition and ``ip`` (as
    ``instance_ip``) on the BINDING transition.
    """
    transitions = (
        (SlotState.LAUNCHING, {"instance_id": instance_id}),
        (SlotState.BOOTING, {}),
        (SlotState.BINDING, {"instance_ip": ip}),
        (SlotState.READY, {}),
    )
    for next_state, extra_fields in transitions:
        db.update_slot_state(slot_id, next_state, **extra_fields)


# --- Test cases ---


def test_pending_reservation_assigned_to_ready_slot():
    """One tick binds a new reservation to the slot that is already ready."""
    db, runtime, config, clock, metrics = _make_env()
    _make_slot_ready(db, "slot001")
    created = db.create_reservation("x86_64-linux", "test", None, 1200)

    scheduling_tick(db, runtime, config, clock, metrics)

    # The reservation must now point at the ready slot and its instance.
    reservation = db.get_reservation(created["reservation_id"])
    assert reservation["phase"] == ReservationPhase.READY.value
    assert reservation["slot_id"] == "slot001"
    assert reservation["instance_id"] == "i-test1"

    # The slot must account for exactly one lease.
    assert db.get_slot("slot001")["lease_count"] == 1


def test_two_pending_one_slot_only_one_assigned_per_tick():
    """With one single-lease ready slot, a tick readies exactly one of two reservations."""
    db, runtime, config, clock, metrics = _make_env(max_leases=1)
    _make_slot_ready(db, "slot001")

    first = db.create_reservation("x86_64-linux", "test1", None, 1200)
    second = db.create_reservation("x86_64-linux", "test2", None, 1200)

    scheduling_tick(db, runtime, config, clock, metrics)

    phases = [
        db.get_reservation(created["reservation_id"])["phase"]
        for created in (first, second)
    ]
    # Which reservation wins is not asserted, only that exactly one does.
    assert phases.count(ReservationPhase.READY.value) == 1
    assert phases.count(ReservationPhase.PENDING.value) == 1

    assert db.get_slot("slot001")["lease_count"] == 1


def test_reservation_expires_when_ttl_passes():
    """A reservation is listed as EXPIRED once the clock moves past its TTL."""
    ttl_seconds = 60
    db, runtime, config, clock, metrics = _make_env()
    config.capacity.reservation_ttl_seconds = ttl_seconds
    db.create_reservation("x86_64-linux", "test", None, ttl_seconds)

    # Step one second beyond the TTL before running the scheduler.
    clock.advance(ttl_seconds + 1)
    scheduling_tick(db, runtime, config, clock, metrics)

    expired = db.list_reservations(ReservationPhase.EXPIRED)
    assert len(expired) == 1


def test_scale_down_starts_when_idle_exceeds_threshold():
    """A ready slot without leases is DRAINING after idling past the threshold."""
    idle_limit = 900
    db, runtime, config, clock, metrics = _make_env(idle_scale_down_seconds=idle_limit)
    _make_slot_ready(db, "slot001")

    # One second over the configured idle threshold.
    clock.advance(idle_limit + 1)
    scheduling_tick(db, runtime, config, clock, metrics)

    assert db.get_slot("slot001")["state"] == SlotState.DRAINING.value


def test_slot_does_not_drain_while_lease_count_positive():
    """A slot holding an assigned reservation stays READY past the idle threshold."""
    idle_limit = 900
    db, runtime, config, clock, metrics = _make_env(idle_scale_down_seconds=idle_limit)
    _make_slot_ready(db, "slot001")
    created = db.create_reservation("x86_64-linux", "test", None, 1200)

    # First tick: the reservation has to land on the slot.
    scheduling_tick(db, runtime, config, clock, metrics)
    assigned = db.get_reservation(created["reservation_id"])
    assert assigned["phase"] == ReservationPhase.READY.value

    # Second tick, after the idle window has elapsed: the slot must not drain.
    clock.advance(idle_limit + 1)
    scheduling_tick(db, runtime, config, clock, metrics)

    assert db.get_slot("slot001")["state"] == SlotState.READY.value


def test_interruption_pending_slot_moves_to_draining():
    """A ready slot flagged ``interruption_pending`` is drained and the flag reset."""
    db, runtime, config, clock, metrics = _make_env()
    _make_slot_ready(db, "slot001")
    db.update_slot_fields("slot001", interruption_pending=1)

    scheduling_tick(db, runtime, config, clock, metrics)

    flagged_slot = db.get_slot("slot001")
    assert flagged_slot["state"] == SlotState.DRAINING.value
    assert flagged_slot["interruption_pending"] == 0


def test_launch_triggered_for_unmet_demand():
    """A reservation with no ready slot available causes one launch in a tick."""
    db, runtime, config, clock, metrics = _make_env()
    db.create_reservation("x86_64-linux", "test", None, 1200)

    scheduling_tick(db, runtime, config, clock, metrics)

    # Exactly one slot is LAUNCHING and it already carries an instance id.
    launching_slots = db.list_slots(SlotState.LAUNCHING)
    assert len(launching_slots) == 1
    assert launching_slots[0]["instance_id"] is not None

    # The fake runtime must report a single managed instance.
    assert len(runtime.list_managed_instances()) == 1


def test_launch_respects_max_slots():
    """No slot enters LAUNCHING when demand exceeds capacity but max_slots is reached."""
    db, runtime, config, clock, metrics = _make_env(max_slots=1)
    _make_slot_ready(db, "slot001")

    # Two reservations compete for the only slot, which takes a single lease.
    for client_tag in ("test1", "test2"):
        db.create_reservation("x86_64-linux", client_tag, None, 1200)

    scheduling_tick(db, runtime, config, clock, metrics)

    # The one active slot already equals max_slots (1), so nothing may launch
    # even though one reservation is expected to remain unserved.
    assert len(db.list_slots(SlotState.LAUNCHING)) == 0


def test_min_slots_maintained():
    """With ``min_slots=1`` and no reservations, a tick still launches one slot."""
    db, runtime, config, clock, metrics = _make_env(min_slots=1)

    # No demand is created: the launch can only come from the min_slots floor.
    scheduling_tick(db, runtime, config, clock, metrics)

    assert len(db.list_slots(SlotState.LAUNCHING)) == 1


def test_scale_down_respects_min_slots():
    """An idle slot stays READY when draining it would go below ``min_slots``."""
    idle_limit = 900
    db, runtime, config, clock, metrics = _make_env(
        min_slots=1,
        idle_scale_down_seconds=idle_limit,
    )
    _make_slot_ready(db, "slot001")

    # Idle for longer than the threshold; the only slot must still be kept.
    clock.advance(idle_limit + 1)
    scheduling_tick(db, runtime, config, clock, metrics)

    assert db.get_slot("slot001")["state"] == SlotState.READY.value