Add operational metrics for webhook and Matrix health
This commit is contained in:
parent
2fde5ffc87
commit
0056f819b6
3 changed files with 195 additions and 26 deletions
97
ops_bot/metrics.py
Normal file
97
ops_bot/metrics.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
import json
import re
from typing import Optional

from prometheus_client import Counter, Histogram
|
||||
|
||||
# --- Webhook intake -------------------------------------------------------

# One increment per incoming webhook, split by source and outcome.
WEBHOOK_EVENTS_TOTAL = Counter(
    name="matrix_ops_bot_webhook_events_total",
    documentation="Incoming webhook events by source and processing result.",
    labelnames=["source", "result"],
)

# --- Matrix message delivery ----------------------------------------------

# Successful sends, per webhook source.
MESSAGES_SENT_TOTAL = Counter(
    name="matrix_ops_bot_messages_sent_total",
    documentation="Messages successfully sent to Matrix by source.",
    labelnames=["source"],
)

# Failed sends, per webhook source and failure reason.
MESSAGE_SEND_FAILURES_TOTAL = Counter(
    name="matrix_ops_bot_message_send_failures_total",
    documentation="Failures while sending messages to Matrix by source and reason.",
    labelnames=["source", "reason"],
)

# Latency between receiving a webhook and the first attempt to send it on.
# No explicit buckets are passed, so the client library's defaults apply.
EVENT_TO_SEND_SECONDS = Histogram(
    name="matrix_ops_bot_event_to_send_seconds",
    documentation="Time from webhook receipt to first Matrix send attempt by source.",
    labelnames=["source"],
)

# --- Matrix session health ------------------------------------------------

# Login / credential-restore attempts, per mode and outcome.
MATRIX_AUTH_TOTAL = Counter(
    name="matrix_ops_bot_matrix_auth_total",
    documentation="Matrix authentication and credential-restore attempts by mode and result.",
    labelnames=["mode", "result"],
)

# Errors raised by the sync loop, bucketed into a coarse reason.
MATRIX_SYNC_ERRORS_TOTAL = Counter(
    name="matrix_ops_bot_matrix_sync_errors_total",
    documentation="Matrix sync loop errors by coarse reason.",
    labelnames=["reason"],
)

# --- Startup --------------------------------------------------------------

# Outcome of loading the bot configuration at startup.
CONFIG_LOADED_TOTAL = Counter(
    name="matrix_ops_bot_config_loaded_total",
    documentation="Bot config load outcomes at startup.",
    labelnames=["result"],
)
|
||||
|
||||
|
||||
def source_label(source: Optional[str]) -> str:
    """Map a raw webhook source name onto a fixed set of metric label values.

    The input is trimmed, lower-cased and has hyphens turned into
    underscores before being compared against the known sources. Anything
    else, including ``None`` and the empty string, becomes ``"unknown"``,
    so the label can only ever take a handful of values.

    Args:
        source: Source name as supplied by the caller, or ``None``.

    Returns:
        One of ``"gitlab"``, ``"pagerduty"``, ``"aws_sns"``,
        ``"alertmanager"`` or ``"unknown"``.
    """
    known_sources = ("gitlab", "pagerduty", "aws_sns", "alertmanager")
    # "" is never a known source, so a missing value falls through to "unknown".
    candidate = "" if source is None else source.strip().lower().replace("-", "_")
    return candidate if candidate in known_sources else "unknown"
|
||||
|
||||
|
||||
def classify_send_failure(exc: Exception) -> str:
    """Reduce a send-time exception to a coarse failure reason.

    Classification is a heuristic over the lower-cased exception class name
    plus its message. Checks run in a fixed order and the first match wins:
    forbidden, rate limit, unknown room, network, then a catch-all.

    HTTP status codes are only recognised as standalone numbers. A bare
    substring test would also fire inside longer digit runs (for example
    ``429`` inside ``"14290 ms"``) and mislabel the failure.

    Args:
        exc: The exception raised while sending a message.

    Returns:
        One of ``"forbidden"``, ``"ratelimit"``, ``"unknown_room"``,
        ``"network"`` or ``"exception"``.
    """
    msg = f"{type(exc).__name__} {exc}".lower()

    # Status codes must not be flanked by other digits.
    if "forbidden" in msg or re.search(r"(?<!\d)403(?!\d)", msg):
        return "forbidden"
    if (
        "rate limit" in msg
        or "too many requests" in msg
        or re.search(r"(?<!\d)429(?!\d)", msg)
    ):
        return "ratelimit"
    if "unknown room" in msg or "room not found" in msg:
        return "unknown_room"

    network_markers = ("timeout", "connection", "network", "dns", "refused")
    if any(marker in msg for marker in network_markers):
        return "network"
    return "exception"
|
||||
|
||||
|
||||
def classify_sync_error(exc: Exception) -> str:
    """Reduce a sync-loop exception to a coarse error reason.

    Classification is a heuristic over the lower-cased exception class name
    plus its message. Checks run in a fixed order and the first match wins:
    auth, network, crypto, then a catch-all.

    HTTP status codes are only recognised as standalone numbers. A bare
    substring test would also fire inside longer digit runs (for example
    ``401`` inside ``"24010 bytes"``) and mislabel the error as an auth
    problem.

    Args:
        exc: The exception raised by the sync loop.

    Returns:
        One of ``"auth"``, ``"network"``, ``"crypto"`` or ``"unknown"``.
    """
    msg = f"{type(exc).__name__} {exc}".lower()

    # Status codes must not be flanked by other digits.
    if (
        re.search(r"(?<!\d)(?:401|403)(?!\d)", msg)
        or "unauthorized" in msg
        or "forbidden" in msg
    ):
        return "auth"

    network_markers = ("timeout", "connection", "network", "dns", "refused")
    if any(marker in msg for marker in network_markers):
        return "network"

    # "olm" also matches "megolm", so one marker covers both.
    if "olm" in msg or "decrypt" in msg:
        return "crypto"
    return "unknown"
|
||||
|
||||
|
||||
def classify_payload_error(exc: Exception) -> str:
    """Tell malformed-payload errors apart from other handler failures.

    Args:
        exc: The exception raised while processing a webhook payload.

    Returns:
        ``"invalid_payload"`` when ``exc`` is a ``ValueError``, ``TypeError``,
        ``KeyError`` or ``json.JSONDecodeError`` (or a subclass of one of
        them); ``"handler_error"`` for everything else.
    """
    # Exception types treated as a problem with the payload itself.
    payload_faults = (ValueError, TypeError, KeyError, json.JSONDecodeError)
    return "invalid_payload" if isinstance(exc, payload_faults) else "handler_error"
|
||||
Loading…
Add table
Add a link
Reference in a new issue