Add operational metrics for webhook and Matrix health
This commit is contained in:
parent
2fde5ffc87
commit
0056f819b6
3 changed files with 195 additions and 26 deletions
|
|
@@ -2,6 +2,7 @@ import asyncio
|
|||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any, AsyncIterator, Dict, List, Optional, Protocol, Tuple, cast
|
||||
|
||||
|
|
@@ -20,6 +21,16 @@ from ops_bot import alertmanager, aws, pagerduty
|
|||
from ops_bot.config import BotSettings, RoutingKey, load_config
|
||||
from ops_bot.gitlab import hook as gitlab_hook
|
||||
from ops_bot.matrix import MatrixClient
|
||||
from ops_bot.metrics import (
|
||||
CONFIG_LOADED_TOTAL,
|
||||
EVENT_TO_SEND_SECONDS,
|
||||
MESSAGES_SENT_TOTAL,
|
||||
MESSAGE_SEND_FAILURES_TOTAL,
|
||||
WEBHOOK_EVENTS_TOTAL,
|
||||
classify_payload_error,
|
||||
classify_send_failure,
|
||||
source_label,
|
||||
)
|
||||
|
||||
|
||||
async def get_matrix_service(request: Request) -> MatrixClient:
|
||||
|
|
@@ -36,8 +47,15 @@ async def matrix_main(matrix_client: MatrixClient) -> None:
|
|||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
|
||||
config_fname = os.environ.get("BOT_CONFIG_FILE", "config.json")
|
||||
bot_settings = load_config(config_fname)
|
||||
c = MatrixClient(settings=bot_settings.matrix, join_rooms=bot_settings.get_rooms())
|
||||
try:
|
||||
bot_settings = load_config(config_fname)
|
||||
c = MatrixClient(
|
||||
settings=bot_settings.matrix, join_rooms=bot_settings.get_rooms()
|
||||
)
|
||||
except Exception:
|
||||
CONFIG_LOADED_TOTAL.labels(result="failure").inc()
|
||||
raise
|
||||
CONFIG_LOADED_TOTAL.labels(result="success").inc()
|
||||
app.state.matrix_client = c
|
||||
app.state.bot_settings = bot_settings
|
||||
asyncio.create_task(matrix_main(c))
|
||||
|
|
@@ -131,16 +149,20 @@ async def webhook_handler(
|
|||
),
|
||||
matrix_client: MatrixClient = Depends(get_matrix_service),
|
||||
) -> Dict[str, str]:
|
||||
request_start = time.perf_counter()
|
||||
route = get_route(request.app.state.bot_settings, routing_key)
|
||||
|
||||
if not route:
|
||||
logging.error(f"unknown routing key {routing_key}")
|
||||
WEBHOOK_EVENTS_TOTAL.labels(source="unknown", result="unknown_route").inc()
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND, detail="Unknown routing key"
|
||||
)
|
||||
|
||||
source = source_label(route.hook_type)
|
||||
handler: Optional[Tuple[Authorizer, ParseHandler]] = handlers.get(route.hook_type)
|
||||
if not handler:
|
||||
WEBHOOK_EVENTS_TOTAL.labels(source=source, result="handler_error").inc()
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_404_NOT_FOUND, detail="Unknown hook type"
|
||||
)
|
||||
|
|
@@ -153,20 +175,51 @@ async def webhook_handler(
|
|||
bearer_credentials=bearer_credentials,
|
||||
basic_credentials=basic_credentials,
|
||||
):
|
||||
WEBHOOK_EVENTS_TOTAL.labels(source=source, result="auth_failed").inc()
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid credentials"
|
||||
)
|
||||
|
||||
payload: Any = await request.json()
|
||||
|
||||
messages = await parse_handler(route, payload, request=request)
|
||||
for msg_plain, msg_formatted in messages:
|
||||
await matrix_client.room_send(
|
||||
route.room_id,
|
||||
msg_plain,
|
||||
message_formatted=msg_formatted,
|
||||
try:
|
||||
payload: Any = await request.json()
|
||||
except Exception:
|
||||
WEBHOOK_EVENTS_TOTAL.labels(source=source, result="invalid_payload").inc()
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid payload"
|
||||
)
|
||||
|
||||
try:
|
||||
messages = await parse_handler(route, payload, request=request)
|
||||
except Exception as exc:
|
||||
WEBHOOK_EVENTS_TOTAL.labels(
|
||||
source=source, result=classify_payload_error(exc)
|
||||
).inc()
|
||||
raise
|
||||
|
||||
first_send_attempt_observed = False
|
||||
for msg_plain, msg_formatted in messages:
|
||||
if not first_send_attempt_observed:
|
||||
EVENT_TO_SEND_SECONDS.labels(source=source).observe(
|
||||
time.perf_counter() - request_start
|
||||
)
|
||||
first_send_attempt_observed = True
|
||||
try:
|
||||
await matrix_client.room_send(
|
||||
route.room_id,
|
||||
msg_plain,
|
||||
message_formatted=msg_formatted,
|
||||
)
|
||||
except Exception as exc:
|
||||
MESSAGE_SEND_FAILURES_TOTAL.labels(
|
||||
source=source, reason=classify_send_failure(exc)
|
||||
).inc()
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_502_BAD_GATEWAY,
|
||||
detail="Failed to send message to Matrix",
|
||||
)
|
||||
MESSAGES_SENT_TOTAL.labels(source=source).inc()
|
||||
|
||||
WEBHOOK_EVENTS_TOTAL.labels(source=source, result="accepted").inc()
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue