agent: complete plan05 closeout
This commit is contained in:
parent
33ba248c49
commit
d8c925b817
12 changed files with 1347 additions and 313 deletions
|
|
@ -217,12 +217,13 @@ def _launch_slot(
|
|||
"""Launch a single slot. Transition to LAUNCHING on success, ERROR on failure."""
|
||||
slot_id = slot["slot_id"]
|
||||
user_data = render_userdata(slot_id, config.aws.region)
|
||||
metrics.counter("autoscaler_ec2_launch_total", {}, 1.0)
|
||||
try:
|
||||
instance_id = runtime.launch_spot(slot_id, user_data)
|
||||
metrics.counter("autoscaler_ec2_launch_total", {"result": "success"}, 1.0)
|
||||
db.update_slot_state(slot_id, SlotState.LAUNCHING, instance_id=instance_id)
|
||||
log.info("slot_launched", extra={"slot_id": slot_id, "instance_id": instance_id})
|
||||
except RuntimeAdapterError as exc:
|
||||
metrics.counter("autoscaler_ec2_launch_total", {"result": exc.category}, 1.0)
|
||||
db.update_slot_state(slot_id, SlotState.ERROR)
|
||||
log.warning(
|
||||
"slot_launch_failed",
|
||||
|
|
@ -257,11 +258,9 @@ def _update_metrics(db: StateDB, metrics: MetricsRegistry, tick_duration: float)
|
|||
summary = db.get_state_summary()
|
||||
|
||||
for state, count in summary["slots"].items():
|
||||
if state == "total":
|
||||
continue
|
||||
metrics.gauge("autoscaler_slots", {"state": state}, float(count))
|
||||
metrics.gauge("autoscaler_slots_total", {"state": state}, float(count))
|
||||
|
||||
for phase, count in summary["reservations"].items():
|
||||
metrics.gauge("autoscaler_reservations", {"phase": phase}, float(count))
|
||||
metrics.gauge("autoscaler_reservations_total", {"phase": phase}, float(count))
|
||||
|
||||
metrics.histogram_observe("autoscaler_scheduler_tick_seconds", {}, tick_duration)
|
||||
metrics.histogram_observe("autoscaler_scheduler_tick_duration_seconds", {}, tick_duration)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue