Add prometheus metrics to monitor tailscalesd

This commit is contained in:
Abel Luck 2023-11-06 11:57:11 +01:00
parent b080318748
commit 2db375820f
4 changed files with 63 additions and 8 deletions

View file

@@ -25,6 +25,7 @@ check:
	$(MAKE) lint
	$(MAKE) types
	$(MAKE) bandit
	$(MAKE) test

lint:
	$(POETRY) flake8 $(SRC)

33
poetry.lock generated
View file

@@ -987,6 +987,37 @@ files = [
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "prometheus-client"
version = "0.18.0"
description = "Python client for the Prometheus monitoring system."
category = "main"
optional = false
python-versions = ">=3.8"
files = [
{file = "prometheus_client-0.18.0-py3-none-any.whl", hash = "sha256:8de3ae2755f890826f4b6479e5571d4f74ac17a81345fe69a6778fdb92579184"},
{file = "prometheus_client-0.18.0.tar.gz", hash = "sha256:35f7a8c22139e2bb7ca5a698e92d38145bc8dc74c1c0bf56f25cca886a764e17"},
]
[package.extras]
twisted = ["twisted"]
[[package]]
name = "prometheus-fastapi-instrumentator"
version = "6.1.0"
description = "Instrument your FastAPI with Prometheus metrics."
category = "main"
optional = false
python-versions = ">=3.7.0,<4.0.0"
files = [
{file = "prometheus_fastapi_instrumentator-6.1.0-py3-none-any.whl", hash = "sha256:2279ac1cf5b9566a4c3a07f78c9c5ee19648ed90976ab87d73d672abc1bfa017"},
{file = "prometheus_fastapi_instrumentator-6.1.0.tar.gz", hash = "sha256:1820d7a90389ce100f7d1285495ead388818ae0882e761c1f3e6e62a410bdf13"},
]
[package.dependencies]
fastapi = ">=0.38.1,<1.0.0"
prometheus-client = ">=0.8.0,<1.0.0"
[[package]]
name = "pycodestyle"
version = "2.11.1"

@@ -1512,4 +1543,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
-content-hash = "c48151b734ebe301d09fb06d15888e2b92fd11b80e710e36aa2552082c1637ad"
+content-hash = "8712e625f6772d4b4126ad799b658caa72b103d756c35572d63dee3db88e1175"

View file

@@ -23,6 +23,8 @@ uvicorn = "^0.24.0"
httpx = "^0.25.1"
pydantic-settings = "^2.0.3"
json-logging = "^1.3.0"
prometheus-client = "^0.18.0"
prometheus-fastapi-instrumentator = "^6.1.0"

[tool.poetry.dev-dependencies]
pytest = "*"

View file

@@ -2,6 +2,7 @@ import asyncio
import logging
import os
import sys
from contextlib import asynccontextmanager
from ipaddress import ip_address
from typing import Dict, List
@@ -9,6 +10,8 @@ import httpx
import json_logging  # type: ignore
import uvicorn
from fastapi import FastAPI
from prometheus_client import Counter
from prometheus_fastapi_instrumentator import Instrumentator
from pydantic import Field, SecretStr
from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -20,6 +23,17 @@ log = logging.getLogger("tailscalesd")
log.setLevel(logging.DEBUG if debug else logging.INFO)
log.addHandler(logging.StreamHandler(sys.stdout))
counter_unhandled_background_task_crashes = Counter(
"tailscalesd_unhandled_background_task_crashes",
"The number of unhandled background task crashes",
)
counter_matrix_sd_down = Counter(
"tailscalesd_matrix_sd_down",
"The number times a matrix sd host was unreachable",
["device_hostname"],
)
def ipv4_only(addresses) -> List[str]:
    """Given a list of ip addresses, returns only the ipv4 ones"""
@@ -38,11 +52,6 @@ class Settings(BaseSettings):
settings = Settings()  # type: ignore[call-arg]

-app = FastAPI()
-json_logging.init_fastapi(enable_json=True)
-json_logging.init_request_instrument(app)

CACHE_SD = []
@@ -60,6 +69,7 @@ async def tailscale_devices() -> List:
"Polling tailscale devices failed!",
exc_info=e,
)
counter_unhandled_background_task_crashes.inc()
return []
@@ -132,6 +142,7 @@ async def matrix_sd(tailnet, devices) -> List:
try:
workers = await matrix_node_sd(device)
except Exception as e:
counter_matrix_sd_down.labels(device_hostname=device["hostname"]).inc()
log.error(
f"Failed parsing matrix node sd for device={device['hostname']}",
exc_info=e,
@@ -169,15 +180,25 @@ async def poll_sd():
CACHE_SD = matrix_targets + device_targets
await asyncio.sleep(settings.interval)
except Exception as e:
counter_unhandled_background_task_crashes.inc()
log.error(
"Service Discovery poller failed",
exc_info=e,
)

-@app.on_event("startup")
-async def start_sd():
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    instrumentator.expose(app)
    asyncio.create_task(poll_sd())
+    yield

app = FastAPI(lifespan=lifespan)
instrumentator = Instrumentator().instrument(app)
json_logging.init_fastapi(enable_json=True)
json_logging.init_request_instrument(app)

@app.get("/")