Add operational metrics for webhook and Matrix health

This commit is contained in:
Abel Luck 2026-03-06 11:57:48 +01:00
parent 2fde5ffc87
commit 0056f819b6
3 changed files with 195 additions and 26 deletions

View file

@ -2,6 +2,7 @@ import asyncio
import logging import logging
import os import os
import sys import sys
import time
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import Any, AsyncIterator, Dict, List, Optional, Protocol, Tuple, cast from typing import Any, AsyncIterator, Dict, List, Optional, Protocol, Tuple, cast
@ -20,6 +21,16 @@ from ops_bot import alertmanager, aws, pagerduty
from ops_bot.config import BotSettings, RoutingKey, load_config from ops_bot.config import BotSettings, RoutingKey, load_config
from ops_bot.gitlab import hook as gitlab_hook from ops_bot.gitlab import hook as gitlab_hook
from ops_bot.matrix import MatrixClient from ops_bot.matrix import MatrixClient
from ops_bot.metrics import (
CONFIG_LOADED_TOTAL,
EVENT_TO_SEND_SECONDS,
MESSAGES_SENT_TOTAL,
MESSAGE_SEND_FAILURES_TOTAL,
WEBHOOK_EVENTS_TOTAL,
classify_payload_error,
classify_send_failure,
source_label,
)
async def get_matrix_service(request: Request) -> MatrixClient: async def get_matrix_service(request: Request) -> MatrixClient:
@ -36,8 +47,15 @@ async def matrix_main(matrix_client: MatrixClient) -> None:
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]: async def lifespan(app: FastAPI) -> AsyncIterator[None]:
config_fname = os.environ.get("BOT_CONFIG_FILE", "config.json") config_fname = os.environ.get("BOT_CONFIG_FILE", "config.json")
bot_settings = load_config(config_fname) try:
c = MatrixClient(settings=bot_settings.matrix, join_rooms=bot_settings.get_rooms()) bot_settings = load_config(config_fname)
c = MatrixClient(
settings=bot_settings.matrix, join_rooms=bot_settings.get_rooms()
)
except Exception:
CONFIG_LOADED_TOTAL.labels(result="failure").inc()
raise
CONFIG_LOADED_TOTAL.labels(result="success").inc()
app.state.matrix_client = c app.state.matrix_client = c
app.state.bot_settings = bot_settings app.state.bot_settings = bot_settings
asyncio.create_task(matrix_main(c)) asyncio.create_task(matrix_main(c))
@ -131,16 +149,20 @@ async def webhook_handler(
), ),
matrix_client: MatrixClient = Depends(get_matrix_service), matrix_client: MatrixClient = Depends(get_matrix_service),
) -> Dict[str, str]: ) -> Dict[str, str]:
request_start = time.perf_counter()
route = get_route(request.app.state.bot_settings, routing_key) route = get_route(request.app.state.bot_settings, routing_key)
if not route: if not route:
logging.error(f"unknown routing key {routing_key}") logging.error(f"unknown routing key {routing_key}")
WEBHOOK_EVENTS_TOTAL.labels(source="unknown", result="unknown_route").inc()
raise HTTPException( raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND, detail="Unknown routing key" status_code=status.HTTP_404_NOT_FOUND, detail="Unknown routing key"
) )
source = source_label(route.hook_type)
handler: Optional[Tuple[Authorizer, ParseHandler]] = handlers.get(route.hook_type) handler: Optional[Tuple[Authorizer, ParseHandler]] = handlers.get(route.hook_type)
if not handler: if not handler:
WEBHOOK_EVENTS_TOTAL.labels(source=source, result="handler_error").inc()
raise HTTPException( raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND, detail="Unknown hook type" status_code=status.HTTP_404_NOT_FOUND, detail="Unknown hook type"
) )
@ -153,20 +175,51 @@ async def webhook_handler(
bearer_credentials=bearer_credentials, bearer_credentials=bearer_credentials,
basic_credentials=basic_credentials, basic_credentials=basic_credentials,
): ):
WEBHOOK_EVENTS_TOTAL.labels(source=source, result="auth_failed").inc()
raise HTTPException( raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid credentials" status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid credentials"
) )
payload: Any = await request.json() try:
payload: Any = await request.json()
messages = await parse_handler(route, payload, request=request) except Exception:
for msg_plain, msg_formatted in messages: WEBHOOK_EVENTS_TOTAL.labels(source=source, result="invalid_payload").inc()
await matrix_client.room_send( raise HTTPException(
route.room_id, status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid payload"
msg_plain,
message_formatted=msg_formatted,
) )
try:
messages = await parse_handler(route, payload, request=request)
except Exception as exc:
WEBHOOK_EVENTS_TOTAL.labels(
source=source, result=classify_payload_error(exc)
).inc()
raise
first_send_attempt_observed = False
for msg_plain, msg_formatted in messages:
if not first_send_attempt_observed:
EVENT_TO_SEND_SECONDS.labels(source=source).observe(
time.perf_counter() - request_start
)
first_send_attempt_observed = True
try:
await matrix_client.room_send(
route.room_id,
msg_plain,
message_formatted=msg_formatted,
)
except Exception as exc:
MESSAGE_SEND_FAILURES_TOTAL.labels(
source=source, reason=classify_send_failure(exc)
).inc()
raise HTTPException(
status_code=status.HTTP_502_BAD_GATEWAY,
detail="Failed to send message to Matrix",
)
MESSAGES_SENT_TOTAL.labels(source=source).inc()
WEBHOOK_EVENTS_TOTAL.labels(source=source, result="accepted").inc()
return {"status": "ok"} return {"status": "ok"}

View file

@ -10,6 +10,12 @@ from nio import AsyncClient, AsyncClientConfig, LoginResponse
from pydantic import BaseModel from pydantic import BaseModel
from pydantic_settings import BaseSettings from pydantic_settings import BaseSettings
from ops_bot.metrics import (
MATRIX_AUTH_TOTAL,
MATRIX_SYNC_ERRORS_TOTAL,
classify_sync_error,
)
class ClientCredentials(BaseModel): class ClientCredentials(BaseModel):
homeserver: str homeserver: str
@ -95,7 +101,11 @@ class MatrixClient:
await client.join(room) await client.join(room)
await client.joined_rooms() await client.joined_rooms()
await client.sync_forever(timeout=300000, full_state=True) try:
await client.sync_forever(timeout=300000, full_state=True)
except Exception as exc:
MATRIX_SYNC_ERRORS_TOTAL.labels(reason=classify_sync_error(exc)).inc()
raise
def save_credentials(self, resp: LoginResponse, homeserver: str) -> None: def save_credentials(self, resp: LoginResponse, homeserver: str) -> None:
credentials = ClientCredentials( credentials = ClientCredentials(
@ -120,8 +130,10 @@ class MatrixClient:
) )
if isinstance(response, LoginResponse): if isinstance(response, LoginResponse):
MATRIX_AUTH_TOTAL.labels(mode="fresh_login", result="success").inc()
self.save_credentials(response, self.settings.homeserver) self.save_credentials(response, self.settings.homeserver)
else: else:
MATRIX_AUTH_TOTAL.labels(mode="fresh_login", result="failure").inc()
logging.error( logging.error(
f'Login for "{self.settings.user_id}" via homeserver="{self.settings.homeserver}"' f'Login for "{self.settings.user_id}" via homeserver="{self.settings.homeserver}"'
) )
@ -129,22 +141,27 @@ class MatrixClient:
sys.exit(1) sys.exit(1)
async def login_with_credentials(self) -> None: async def login_with_credentials(self) -> None:
credentials = self.credential_store.read() try:
credentials = self.credential_store.read()
self.client = AsyncClient( self.client = AsyncClient(
homeserver=credentials.homeserver, homeserver=credentials.homeserver,
user=credentials.user_id, user=credentials.user_id,
device_id=credentials.device_id, device_id=credentials.device_id,
store_path=str(self.store_path), store_path=str(self.store_path),
config=self.client_config, config=self.client_config,
ssl=True, ssl=True,
) )
self.client.restore_login( self.client.restore_login(
user_id=credentials.user_id, user_id=credentials.user_id,
device_id=credentials.device_id, device_id=credentials.device_id,
access_token=credentials.access_token, access_token=credentials.access_token,
) )
except Exception:
MATRIX_AUTH_TOTAL.labels(mode="credential_restore", result="failure").inc()
raise
MATRIX_AUTH_TOTAL.labels(mode="credential_restore", result="success").inc()
async def login(self) -> None: async def login(self) -> None:
if self.credential_store.exists(): if self.credential_store.exists():
@ -171,12 +188,14 @@ class MatrixClient:
message_formatted, extensions=["extra"] message_formatted, extensions=["extra"]
) )
await self.client.room_send( response = await self.client.room_send(
room_id=room, room_id=room,
message_type="m.room.message", message_type="m.room.message",
content=content, content=content,
ignore_unverified_devices=True, ignore_unverified_devices=True,
) )
if response.__class__.__name__.endswith("Error"):
raise RuntimeError(f"Matrix room_send failed: {response}")
async def shutdown(self) -> None: async def shutdown(self) -> None:
if self.client is not None: if self.client is not None:

97
ops_bot/metrics.py Normal file
View file

@ -0,0 +1,97 @@
import json
from typing import Optional
from prometheus_client import Counter, Histogram
WEBHOOK_EVENTS_TOTAL = Counter(
"matrix_ops_bot_webhook_events_total",
"Incoming webhook events by source and processing result.",
["source", "result"],
)
MESSAGES_SENT_TOTAL = Counter(
"matrix_ops_bot_messages_sent_total",
"Messages successfully sent to Matrix by source.",
["source"],
)
MESSAGE_SEND_FAILURES_TOTAL = Counter(
"matrix_ops_bot_message_send_failures_total",
"Failures while sending messages to Matrix by source and reason.",
["source", "reason"],
)
MATRIX_AUTH_TOTAL = Counter(
"matrix_ops_bot_matrix_auth_total",
"Matrix authentication and credential-restore attempts by mode and result.",
["mode", "result"],
)
MATRIX_SYNC_ERRORS_TOTAL = Counter(
"matrix_ops_bot_matrix_sync_errors_total",
"Matrix sync loop errors by coarse reason.",
["reason"],
)
EVENT_TO_SEND_SECONDS = Histogram(
"matrix_ops_bot_event_to_send_seconds",
"Time from webhook receipt to first Matrix send attempt by source.",
["source"],
)
CONFIG_LOADED_TOTAL = Counter(
"matrix_ops_bot_config_loaded_total",
"Bot config load outcomes at startup.",
["result"],
)
def source_label(source: Optional[str]) -> str:
if source is None:
return "unknown"
normalized = source.strip().lower().replace("-", "_")
if normalized in {"gitlab", "pagerduty", "aws_sns", "alertmanager"}:
return normalized
return "unknown"
def classify_send_failure(exc: Exception) -> str:
msg = f"{type(exc).__name__} {exc}".lower()
if "forbidden" in msg or "403" in msg:
return "forbidden"
if "rate limit" in msg or "too many requests" in msg or "429" in msg:
return "ratelimit"
if "unknown room" in msg or "room not found" in msg:
return "unknown_room"
if (
"timeout" in msg
or "connection" in msg
or "network" in msg
or "dns" in msg
or "refused" in msg
):
return "network"
return "exception"
def classify_sync_error(exc: Exception) -> str:
msg = f"{type(exc).__name__} {exc}".lower()
if "401" in msg or "403" in msg or "unauthorized" in msg or "forbidden" in msg:
return "auth"
if (
"timeout" in msg
or "connection" in msg
or "network" in msg
or "dns" in msg
or "refused" in msg
):
return "network"
if "megolm" in msg or "olm" in msg or "decrypt" in msg:
return "crypto"
return "unknown"
def classify_payload_error(exc: Exception) -> str:
if isinstance(exc, (ValueError, TypeError, KeyError, json.JSONDecodeError)):
return "invalid_payload"
return "handler_error"