alarms: refactor the alarms subsystem

Also include EOTK alarms now.
This commit is contained in:
Iain Learmonth 2022-05-18 15:49:36 +01:00
parent a935055083
commit e2ce24bf3b
17 changed files with 288 additions and 152 deletions

View file

@ -0,0 +1,54 @@
from typing import Tuple, Optional
import boto3
from sqlalchemy import func
from app import app
from app.alarms import get_or_create_alarm
from app.extensions import db
from app.models.base import Group
from app.models.alarms import AlarmState
from app.models.onions import Eotk
from app.terraform import BaseAutomation
def alarms_in_region(region: str, prefix: str, aspect: str) -> None:
    """Import CloudWatch alarm states for the EOTK instances in one AWS region.

    Every CloudWatch metric alarm whose name starts with ``prefix`` is matched
    to an ``Eotk`` row: the name with the prefix removed is split on ``-`` and
    its second component is compared with the lower-cased ``Group.group_name``;
    the instance is then the ``Eotk`` of that group in ``region``. The alarm
    returned by ``get_or_create_alarm`` for that instance is updated to mirror
    the CloudWatch state ("OK" -> OK, "ALARM" -> CRITICAL, anything else ->
    UNKNOWN).

    Alarms whose name cannot be parsed, or that match no group or instance,
    are reported on stdout and skipped. This function does not commit the
    database session.

    :param region: AWS region name to query.
    :param prefix: CloudWatch alarm name prefix selecting the alarms to import.
    :param aspect: aspect passed to ``get_or_create_alarm``.
    """
    cloudwatch = boto3.client('cloudwatch',
                              aws_access_key_id=app.config['AWS_ACCESS_KEY'],
                              aws_secret_access_key=app.config['AWS_SECRET_KEY'],
                              region_name=region)
    dist_paginator = cloudwatch.get_paginator('describe_alarms')
    page_iterator = dist_paginator.paginate(AlarmNamePrefix=prefix)
    for page in page_iterator:
        for cw_alarm in page['MetricAlarms']:
            eotk_id = cw_alarm["AlarmName"][len(prefix):].split("-")
            if len(eotk_id) < 2:
                # Without a second component there is no group name to look
                # up; skip this alarm instead of raising IndexError and
                # aborting the import of every remaining alarm.
                print("Unable to parse group from " + cw_alarm['AlarmName'])
                continue
            group: Optional[Group] = Group.query.filter(func.lower(Group.group_name) == eotk_id[1]).first()
            if group is None:
                print("Unable to find group for " + cw_alarm['AlarmName'])
                continue
            eotk = Eotk.query.filter(
                Eotk.group_id == group.id,
                Eotk.region == region
            ).first()
            if eotk is None:
                print("Skipping unknown instance " + cw_alarm['AlarmName'])
                continue
            alarm = get_or_create_alarm(eotk.brn, aspect)
            if cw_alarm['StateValue'] == "OK":
                alarm.update_state(AlarmState.OK, "CloudWatch alarm OK")
            elif cw_alarm['StateValue'] == "ALARM":
                alarm.update_state(AlarmState.CRITICAL, "CloudWatch alarm ALARM")
            else:
                alarm.update_state(AlarmState.UNKNOWN, f"CloudWatch alarm {cw_alarm['StateValue']}")
class AlarmEotkAwsAutomation(BaseAutomation):
    """Automation that imports CloudWatch alarm states for AWS EOTK instances."""

    short_name = "monitor_eotk_aws"
    description = "Import alarms for AWS EOTK instances"

    def automate(self, full: bool = False) -> Tuple[bool, str]:
        """Import the bandwidth and CPU alarms for each region, then commit.

        :param full: accepted for the automation interface; not used here.
        :return: always ``(True, "")``.
        """
        # (CloudWatch alarm name prefix, aspect) pairs imported per region,
        # in the order they are processed.
        monitored = (
            ("eotk-bw-out-high-", "bandwidth-out-high"),
            ("eotk-cpu-high-", "instance-cpu"),
        )
        for aws_region in ("us-east-2", "eu-central-1"):
            for name_prefix, aspect in monitored:
                alarms_in_region(aws_region, name_prefix, aspect)
        db.session.commit()
        return True, ""

View file

@ -4,7 +4,7 @@ from azure.identity import ClientSecretCredential
from azure.mgmt.alertsmanagement import AlertsManagementClient
from app import app
from app.alarms import get_proxy_alarm
from app.alarms import get_or_create_alarm
from app.models.alarms import AlarmState
from app.models.mirrors import Proxy
from app.terraform import BaseAutomation
@ -30,7 +30,7 @@ class AlarmProxyAzureCdnAutomation(BaseAutomation):
Proxy.provider == "azure_cdn",
Proxy.destroyed.is_(None)
):
alarm = get_proxy_alarm(proxy.id, "bandwidth-out-high")
alarm = get_or_create_alarm(proxy.brn, "bandwidth-out-high")
if proxy.origin.group.group_name.lower() not in firing:
alarm.update_state(AlarmState.OK, "Azure monitor alert not firing")
else:

View file

@ -2,60 +2,62 @@ import datetime
from typing import Tuple
import boto3
from flask import current_app
from app import app
from app.alarms import get_proxy_alarm
from app.alarms import get_or_create_alarm
from app.extensions import db
from app.models.mirrors import Proxy
from app.models.alarms import AlarmState, Alarm
from app.models.alarms import AlarmState
from app.terraform import BaseAutomation
def _cloudfront_quota() -> None:
    """Update the "quota-usage" alarm from the number of deployed proxies.

    The alarm is looked up (or created) under the
    ``mirror:cloudfront:quota/distributions`` resource name in the configured
    global namespace. Its state becomes CRITICAL above 370 deployed proxies,
    WARNING above 320, and OK otherwise; the message always carries the count.
    This function does not commit the database session.
    """
    critical_above = 370
    warning_above = 320
    alarm = get_or_create_alarm(
        f"brn:{current_app.config['GLOBAL_NAMESPACE']}:0:mirror:cloudfront:quota/distributions",
        "quota-usage"
    )
    alarm.last_updated = datetime.datetime.utcnow()
    # Count in the database rather than loading every Proxy row just to
    # measure the length of the result list.
    # NOTE(review): this counts every non-destroyed proxy regardless of
    # provider — confirm that is intended for a CloudFront quota.
    deployed_count = Proxy.query.filter(Proxy.destroyed.is_(None)).count()
    message = f"{deployed_count} distributions deployed"
    if deployed_count > critical_above:
        alarm.update_state(AlarmState.CRITICAL, message)
    elif deployed_count > warning_above:
        alarm.update_state(AlarmState.WARNING, message)
    else:
        alarm.update_state(AlarmState.OK, message)
def _proxy_alarms() -> None:
    """Mirror CloudWatch "bandwidth-out-high-" alarms onto proxy alarms.

    Queries CloudWatch in us-east-2 for alarms with that name prefix, treats
    the rest of each alarm name as a proxy slug, and updates the proxy's
    "bandwidth-out-high" alarm to match. Alarms with no matching proxy are
    reported on stdout and skipped. The database session is not committed
    here.
    """
    name_prefix = "bandwidth-out-high-"
    # CloudWatch state value -> (portal alarm state, message).
    known_states = {
        "OK": (AlarmState.OK, "CloudWatch alarm OK"),
        "ALARM": (AlarmState.CRITICAL, "CloudWatch alarm ALARM"),
    }
    client = boto3.client('cloudwatch',
                          aws_access_key_id=app.config['AWS_ACCESS_KEY'],
                          aws_secret_access_key=app.config['AWS_SECRET_KEY'],
                          region_name='us-east-2')
    pages = client.get_paginator('describe_alarms').paginate(AlarmNamePrefix=name_prefix)
    for page in pages:
        for cw_alarm in page['MetricAlarms']:
            slug = cw_alarm["AlarmName"][len(name_prefix):]
            proxy = Proxy.query.filter(Proxy.slug == slug).first()
            if proxy is None:
                print("Skipping unknown proxy " + slug)
                continue
            alarm = get_or_create_alarm(proxy.brn, "bandwidth-out-high")
            state_value = cw_alarm['StateValue']
            mapped = known_states.get(state_value)
            if mapped is None:
                # Any other CloudWatch state is recorded as UNKNOWN.
                mapped = (AlarmState.UNKNOWN, f"CloudWatch alarm {state_value}")
            alarm.update_state(*mapped)
class AlarmProxyCloudfrontAutomation(BaseAutomation):
    """Automation that imports alarms for AWS CloudFront proxies."""

    short_name = "monitor_proxy_cloudfront"
    description = "Import alarms for AWS CloudFront proxies"

    def automate(self, full: bool = False) -> Tuple[bool, str]:
        """Refresh the per-proxy CloudWatch alarms and the quota alarm.

        The work is delegated to ``_proxy_alarms`` and ``_cloudfront_quota``.
        The previous inline copy of that logic (using ``get_proxy_alarm`` and
        a hand-built ``Alarm`` row) is dropped, so each alarm is processed
        once per run rather than twice under two different keys.

        :param full: accepted for the automation interface; not used here.
        :return: always ``(True, "")``.
        """
        _proxy_alarms()
        _cloudfront_quota()
        # Both helpers only modify the session; persist their changes together.
        db.session.commit()
        return True, ""

View file

@ -3,26 +3,13 @@ from typing import Tuple
import requests
from requests import RequestException
from app.alarms import get_or_create_alarm
from app.extensions import db
from app.models.alarms import Alarm, AlarmState
from app.models.alarms import AlarmState
from app.models.mirrors import Proxy
from app.terraform import BaseAutomation
def set_http_alarm(proxy_id: int, state: AlarmState, text: str) -> None:
    """Set the state of a proxy's "http-status" alarm, creating it if absent.

    :param proxy_id: ID of the proxy the alarm belongs to.
    :param state: new alarm state.
    :param text: text passed to ``update_state`` along with the state.
    """
    kind = "http-status"
    existing = Alarm.query.filter(
        Alarm.proxy_id == proxy_id,
        Alarm.alarm_type == kind
    ).first()
    if existing is not None:
        existing.update_state(state, text)
        return
    # No alarm yet for this proxy: create one and add it to the session
    # before recording its first state.
    created = Alarm()
    created.proxy_id = proxy_id
    created.alarm_type = kind
    created.target = "proxy"
    db.session.add(created)
    created.update_state(state, text)
class AlarmProxyHTTPStatusAutomation(BaseAutomation):
short_name = "alarm_http_status"
description = "Check all deployed proxies for HTTP status code"
@ -40,28 +27,26 @@ class AlarmProxyHTTPStatusAutomation(BaseAutomation):
allow_redirects=False,
timeout=5)
r.raise_for_status()
alarm = get_or_create_alarm(proxy.brn, "http-status")
if r.is_redirect:
set_http_alarm(
proxy.id,
alarm.update_state(
AlarmState.CRITICAL,
f"{r.status_code} {r.reason}"
)
else:
set_http_alarm(
proxy.id,
alarm.update_state(
AlarmState.OK,
f"{r.status_code} {r.reason}"
)
except requests.HTTPError:
set_http_alarm(
proxy.id,
alarm.update_state(
AlarmState.CRITICAL,
f"{r.status_code} {r.reason}"
)
except RequestException as e:
set_http_alarm(
proxy.id,
alarm.update_state(
AlarmState.CRITICAL,
repr(e)
)
db.session.commit()
return True, ""