agent: complete plan05 closeout

This commit is contained in:
Abel Luck 2026-02-27 13:48:52 +01:00
parent 33ba248c49
commit 2f0fffa905
12 changed files with 1347 additions and 313 deletions

View file

@ -68,7 +68,7 @@ class Reconciler:
# 2. Query HAProxy
try:
haproxy_health = self._haproxy.read_slot_health()
haproxy_health = self._haproxy_read_slot_health()
except HAProxyError:
log.warning("haproxy_stat_failed", exc_info=True)
haproxy_health = {}
@ -142,8 +142,8 @@ class Reconciler:
if tailscale_ip is not None:
self._db.update_slot_state(slot["slot_id"], SlotState.BINDING, instance_ip=tailscale_ip)
try:
self._haproxy.set_slot_addr(slot["slot_id"], tailscale_ip)
self._haproxy.enable_slot(slot["slot_id"])
self._haproxy_set_slot_addr(slot["slot_id"], tailscale_ip)
self._haproxy_enable_slot(slot["slot_id"])
except HAProxyError:
log.warning(
"haproxy_binding_setup_failed",
@ -169,8 +169,8 @@ class Reconciler:
ip = slot.get("instance_ip")
if ip:
try:
self._haproxy.set_slot_addr(slot_id, ip)
self._haproxy.enable_slot(slot_id)
self._haproxy_set_slot_addr(slot_id, ip)
self._haproxy_enable_slot(slot_id)
except HAProxyError:
pass
@ -204,7 +204,7 @@ class Reconciler:
# Disable HAProxy (idempotent)
with contextlib.suppress(HAProxyError):
self._haproxy.disable_slot(slot_id)
self._haproxy_disable_slot(slot_id)
now = self._clock.now()
last_change = datetime.fromisoformat(slot["last_state_change"])
@ -216,8 +216,17 @@ class Reconciler:
if instance_id:
try:
self._runtime.terminate_instance(instance_id)
self._metrics.counter("autoscaler_ec2_terminate_total", {}, 1.0)
self._metrics.counter(
"autoscaler_ec2_terminate_total",
{"result": "success"},
1.0,
)
except Exception:
self._metrics.counter(
"autoscaler_ec2_terminate_total",
{"result": "error"},
1.0,
)
log.warning(
"terminate_failed",
extra={"slot_id": slot_id, "instance_id": instance_id},
@ -252,7 +261,70 @@ class Reconciler:
"""Emit reconciler metrics."""
summary = self._db.get_state_summary()
for state, count in summary["slots"].items():
if state == "total":
continue
self._metrics.gauge("autoscaler_slots", {"state": state}, float(count))
self._metrics.histogram_observe("autoscaler_reconciler_tick_seconds", {}, tick_duration)
self._metrics.gauge("autoscaler_slots_total", {"state": state}, float(count))
self._metrics.histogram_observe("autoscaler_reconcile_duration_seconds", {}, tick_duration)
def _haproxy_set_slot_addr(self, slot_id: str, ip: str) -> None:
    """Call ``set_slot_addr`` on the HAProxy client and count the outcome.

    Increments ``autoscaler_haproxy_command_total`` with
    ``cmd="set_slot_addr"`` and ``result="success"`` or ``result="error"``.

    Args:
        slot_id: Slot identifier forwarded to the HAProxy client.
        ip: Address forwarded to the HAProxy client.

    Raises:
        HAProxyError: Re-raised unchanged after the error counter is bumped.
    """
    try:
        # Guard only the HAProxy call so that a failure while recording the
        # success metric is never miscounted as a command error.
        self._haproxy.set_slot_addr(slot_id, ip)
    except HAProxyError:
        self._metrics.counter(
            "autoscaler_haproxy_command_total",
            {"cmd": "set_slot_addr", "result": "error"},
            1.0,
        )
        raise
    else:
        self._metrics.counter(
            "autoscaler_haproxy_command_total",
            {"cmd": "set_slot_addr", "result": "success"},
            1.0,
        )
def _haproxy_enable_slot(self, slot_id: str) -> None:
    """Call ``enable_slot`` on the HAProxy client and count the outcome.

    Increments ``autoscaler_haproxy_command_total`` with
    ``cmd="enable_slot"`` and ``result="success"`` or ``result="error"``.

    Args:
        slot_id: Slot identifier forwarded to the HAProxy client.

    Raises:
        HAProxyError: Re-raised unchanged after the error counter is bumped.
    """
    try:
        # Guard only the HAProxy call so that a failure while recording the
        # success metric is never miscounted as a command error.
        self._haproxy.enable_slot(slot_id)
    except HAProxyError:
        self._metrics.counter(
            "autoscaler_haproxy_command_total",
            {"cmd": "enable_slot", "result": "error"},
            1.0,
        )
        raise
    else:
        self._metrics.counter(
            "autoscaler_haproxy_command_total",
            {"cmd": "enable_slot", "result": "success"},
            1.0,
        )
def _haproxy_disable_slot(self, slot_id: str) -> None:
    """Call ``disable_slot`` on the HAProxy client and count the outcome.

    Increments ``autoscaler_haproxy_command_total`` with
    ``cmd="disable_slot"`` and ``result="success"`` or ``result="error"``.

    Args:
        slot_id: Slot identifier forwarded to the HAProxy client.

    Raises:
        HAProxyError: Re-raised unchanged after the error counter is bumped.
    """
    try:
        # Guard only the HAProxy call so that a failure while recording the
        # success metric is never miscounted as a command error.
        self._haproxy.disable_slot(slot_id)
    except HAProxyError:
        self._metrics.counter(
            "autoscaler_haproxy_command_total",
            {"cmd": "disable_slot", "result": "error"},
            1.0,
        )
        raise
    else:
        self._metrics.counter(
            "autoscaler_haproxy_command_total",
            {"cmd": "disable_slot", "result": "success"},
            1.0,
        )
def _haproxy_read_slot_health(self) -> dict:
    """Call ``read_slot_health`` on the HAProxy client and count the outcome.

    Increments ``autoscaler_haproxy_command_total`` with ``cmd="show_stat"``
    and ``result="success"`` or ``result="error"``.

    Returns:
        Whatever the HAProxy client's ``read_slot_health()`` returned,
        passed through unchanged.

    Raises:
        HAProxyError: Re-raised unchanged after the error counter is bumped.
    """
    try:
        # Guard only the HAProxy call so that a failure while recording the
        # success metric is never miscounted as a command error.
        health = self._haproxy.read_slot_health()
    except HAProxyError:
        self._metrics.counter(
            "autoscaler_haproxy_command_total",
            {"cmd": "show_stat", "result": "error"},
            1.0,
        )
        raise
    self._metrics.counter(
        "autoscaler_haproxy_command_total",
        {"cmd": "show_stat", "result": "success"},
        1.0,
    )
    return health