Add prometheus metrics to monitor tailscalesd
This commit is contained in:
parent
b080318748
commit
2db375820f
4 changed files with 63 additions and 8 deletions
|
|
@ -2,6 +2,7 @@ import asyncio
|
|||
import logging
|
||||
import os
|
||||
import sys
|
||||
from contextlib import asynccontextmanager
|
||||
from ipaddress import ip_address
|
||||
from typing import Dict, List
|
||||
|
||||
|
|
@ -9,6 +10,8 @@ import httpx
|
|||
import json_logging # type: ignore
|
||||
import uvicorn
|
||||
from fastapi import FastAPI
|
||||
from prometheus_client import Counter
|
||||
from prometheus_fastapi_instrumentator import Instrumentator
|
||||
from pydantic import Field, SecretStr
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
|
@ -20,6 +23,17 @@ log = logging.getLogger("tailscalesd")
|
|||
log.setLevel(logging.DEBUG if debug else logging.INFO)
|
||||
log.addHandler(logging.StreamHandler(sys.stdout))
|
||||
|
||||
# Prometheus counters exported by tailscalesd.

# Incremented when polling the tailscale devices fails and when the service
# discovery poller itself raises an otherwise-unhandled exception.
counter_unhandled_background_task_crashes = Counter(
    "tailscalesd_unhandled_background_task_crashes",
    "The number of unhandled background task crashes",
)

# Incremented once per failed matrix node service-discovery lookup, labelled
# with the hostname of the device that could not be queried.
counter_matrix_sd_down = Counter(
    "tailscalesd_matrix_sd_down",
    "The number of times a matrix sd host was unreachable",
    ["device_hostname"],
)
|
||||
|
||||
|
||||
def ipv4_only(addresses) -> List[str]:
|
||||
"""Given a list of ip addresses, returns only the ipv4 ones"""
|
||||
|
|
@ -38,11 +52,6 @@ class Settings(BaseSettings):
|
|||
|
||||
|
||||
settings = Settings() # type: ignore[call-arg]
|
||||
app = FastAPI()
|
||||
|
||||
json_logging.init_fastapi(enable_json=True)
|
||||
json_logging.init_request_instrument(app)
|
||||
|
||||
CACHE_SD = []
|
||||
|
||||
|
||||
|
|
@ -60,6 +69,7 @@ async def tailscale_devices() -> List:
|
|||
"Polling tailscale devices failed!",
|
||||
exc_info=e,
|
||||
)
|
||||
counter_unhandled_background_task_crashes.inc()
|
||||
return []
|
||||
|
||||
|
||||
|
|
@ -132,6 +142,7 @@ async def matrix_sd(tailnet, devices) -> List:
|
|||
try:
|
||||
workers = await matrix_node_sd(device)
|
||||
except Exception as e:
|
||||
counter_matrix_sd_down.labels(device_hostname=device["hostname"]).inc()
|
||||
log.error(
|
||||
f"Failed parsing matrix node sd for device={device['hostname']}",
|
||||
exc_info=e,
|
||||
|
|
@ -169,15 +180,25 @@ async def poll_sd():
|
|||
CACHE_SD = matrix_targets + device_targets
|
||||
await asyncio.sleep(settings.interval)
|
||||
except Exception as e:
|
||||
counter_unhandled_background_task_crashes.inc()
|
||||
log.error(
|
||||
"Service Discovery poller failed",
|
||||
exc_info=e,
|
||||
)
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def start_sd():
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: expose metrics and run the SD poller.

    On startup the instrumentator's metrics endpoint is exposed on *app*
    and the service-discovery poller is started as a background task. On
    shutdown the poller task is cancelled.
    """
    instrumentator.expose(app)
    # Keep a strong reference to the task: the event loop only holds weak
    # references, so an unreferenced task may be garbage-collected while it
    # is still running.
    poller_task = asyncio.create_task(poll_sd())
    try:
        yield
    finally:
        # Stop the background poller when the application shuts down.
        poller_task.cancel()
|
||||
|
||||
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
instrumentator = Instrumentator().instrument(app)
|
||||
|
||||
json_logging.init_fastapi(enable_json=True)
|
||||
json_logging.init_request_instrument(app)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue