Add Prometheus metrics to monitor tailscalesd

This commit is contained in:
Abel Luck 2023-11-06 11:57:11 +01:00
parent b080318748
commit 2db375820f
4 changed files with 63 additions and 8 deletions

View file

@ -2,6 +2,7 @@ import asyncio
import logging
import os
import sys
from contextlib import asynccontextmanager
from ipaddress import ip_address
from typing import Dict, List
@ -9,6 +10,8 @@ import httpx
import json_logging # type: ignore
import uvicorn
from fastapi import FastAPI
from prometheus_client import Counter
from prometheus_fastapi_instrumentator import Instrumentator
from pydantic import Field, SecretStr
from pydantic_settings import BaseSettings, SettingsConfigDict
@ -20,6 +23,17 @@ log = logging.getLogger("tailscalesd")
# Log at DEBUG when the `debug` flag is set, otherwise at INFO.
log.setLevel(logging.DEBUG if debug else logging.INFO)
# Send log records to stdout.
log.addHandler(logging.StreamHandler(sys.stdout))
# Counts failures of the background polling work; it is incremented at the
# places where a polling exception is caught and logged.
counter_unhandled_background_task_crashes = Counter(
    name="tailscalesd_unhandled_background_task_crashes",
    documentation="The number of unhandled background task crashes",
)
# Per-device counter of failed matrix service-discovery lookups; it is
# incremented (labelled with the device's hostname) when querying a device's
# matrix node sd raises an exception.
counter_matrix_sd_down = Counter(
    "tailscalesd_matrix_sd_down",
    "The number of times a matrix sd host was unreachable",
    ["device_hostname"],
)
def ipv4_only(addresses) -> List[str]:
"""Given a list of ip addresses, returns only the ipv4 ones"""
@ -38,11 +52,6 @@ class Settings(BaseSettings):
# Module-level configuration instance shared by the rest of the module.
# NOTE(review): the call-arg ignore presumably exists because required fields
# are supplied from the environment rather than as arguments — confirm.
settings = Settings() # type: ignore[call-arg]
app = FastAPI()
json_logging.init_fastapi(enable_json=True)
json_logging.init_request_instrument(app)
# Most recently computed service-discovery targets. Starts empty and is
# replaced by the poller with matrix_targets + device_targets on each cycle.
CACHE_SD = []
@ -60,6 +69,7 @@ async def tailscale_devices() -> List:
"Polling tailscale devices failed!",
exc_info=e,
)
counter_unhandled_background_task_crashes.inc()
return []
@ -132,6 +142,7 @@ async def matrix_sd(tailnet, devices) -> List:
try:
workers = await matrix_node_sd(device)
except Exception as e:
counter_matrix_sd_down.labels(device_hostname=device["hostname"]).inc()
log.error(
f"Failed parsing matrix node sd for device={device['hostname']}",
exc_info=e,
@ -169,15 +180,25 @@ async def poll_sd():
CACHE_SD = matrix_targets + device_targets
await asyncio.sleep(settings.interval)
except Exception as e:
counter_unhandled_background_task_crashes.inc()
log.error(
"Service Discovery poller failed",
exc_info=e,
)
@app.on_event("startup")
async def start_sd():
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup and shutdown work for the FastAPI application.

    On startup the metrics endpoint is exposed on ``app`` and the service
    discovery poller is started as a background task. On shutdown the poller
    is cancelled and awaited so it does not outlive the application.

    Args:
        app: The FastAPI application whose lifespan is being managed.

    Yields:
        None. Control is handed back to the framework while the app serves.
    """
    instrumentator.expose(app)
    # Keep a strong reference: the event loop only holds weak references to
    # tasks, so an unreferenced task may be garbage-collected mid-execution.
    poller_task = asyncio.create_task(poll_sd())
    try:
        yield
    finally:
        # Stop the background poller cleanly when the application shuts down.
        poller_task.cancel()
        try:
            await poller_task
        except asyncio.CancelledError:
            # Expected outcome of cancelling the poller; nothing to report.
            pass
# Create the application with the lifespan handler so its startup work
# (exposing metrics, starting the poller) runs when the server starts.
app = FastAPI(lifespan=lifespan)
# Instrument the app now; the resulting object is kept at module level
# because lifespan() calls instrumentator.expose(app) on startup.
instrumentator = Instrumentator().instrument(app)
# Initialise json_logging for FastAPI with JSON output enabled, then attach
# its request instrumentation to the app.
json_logging.init_fastapi(enable_json=True)
json_logging.init_request_instrument(app)
@app.get("/")