alarms: refactor the alarms subsystem

Also include EOTK alarms now.
This commit is contained in:
Iain Learmonth 2022-05-18 15:49:36 +01:00
parent a935055083
commit e2ce24bf3b
17 changed files with 288 additions and 152 deletions

View file

@ -1,48 +1,40 @@
import datetime
from typing import Optional
from typing import Optional, List
from app.extensions import db
from app.models.alarms import Alarm
def alarms_for(target: str) -> List[Alarm]:
    """Return all alarms whose ``target`` column equals the given string.

    :param target: target identifier to match exactly against ``Alarm.target``.
    :return: a list of matching alarms; empty when none match.
    """
    matching = Alarm.query.filter(Alarm.target == target)
    return list(matching.all())
def _get_alarm(target: str,
alarm_type: str,
*,
proxy_id: Optional[int] = None,
origin_id: Optional[int] = None,
aspect: str,
create_if_missing: bool = True) -> Optional[Alarm]:
alarm: Optional[Alarm]
if target == "proxy":
alarm = Alarm.query.filter(
Alarm.target == "proxy",
Alarm.alarm_type == alarm_type,
Alarm.proxy_id == proxy_id
).first()
elif target == "origin":
alarm = Alarm.query.filter(
Alarm.target == "origin",
Alarm.alarm_type == alarm_type,
Alarm.proxy_id == origin_id
).first()
else:
return None
alarm: Optional[Alarm] = Alarm.query.filter(
Alarm.aspect == aspect,
Alarm.target == target
).first()
if create_if_missing and alarm is None:
alarm = Alarm()
alarm.aspect = aspect
alarm.target = target
alarm.alarm_type = alarm_type
alarm.text = "New alarm"
alarm.state_changed = datetime.datetime.utcnow()
if target == "proxy":
alarm.proxy_id = proxy_id
if target == "origin":
alarm.origin_id = origin_id
alarm.last_updated = datetime.datetime.utcnow()
db.session.add(alarm)
db.session.commit()
return alarm
def get_proxy_alarm(proxy_id: int, alarm_type: str) -> Alarm:
alarm = _get_alarm("proxy", alarm_type, proxy_id=proxy_id)
def get_alarm(target: str, aspect: str) -> Optional[Alarm]:
    """Look up an alarm for a target/aspect pair without creating one.

    :param target: target identifier the alarm is recorded against.
    :param aspect: the aspect of the target that the alarm tracks.
    :return: whatever ``_get_alarm`` returns with creation disabled.
    """
    existing = _get_alarm(target, aspect, create_if_missing=False)
    return existing
def get_or_create_alarm(target: str, aspect: str) -> Alarm:
alarm = _get_alarm(target, aspect, create_if_missing=True)
if alarm is None:
# mypy can't tell that this will never be reached
raise RuntimeError("Creating an alarm must have failed.")
raise RuntimeError("Asked for an alarm to be created but got None.")
return alarm

View file

@ -15,6 +15,7 @@ from app.terraform.block_external import BlockExternalAutomation
from app.terraform.block_ooni import BlockOONIAutomation
from app.terraform.block_roskomsvoboda import BlockRoskomsvobodaAutomation
from app.terraform.eotk.aws import EotkAWSAutomation
from app.terraform.alarms.eotk_aws import AlarmEotkAwsAutomation
from app.terraform.alarms.proxy_azure_cdn import AlarmProxyAzureCdnAutomation
from app.terraform.alarms.proxy_cloudfront import AlarmProxyCloudfrontAutomation
from app.terraform.alarms.proxy_http_status import AlarmProxyHTTPStatusAutomation
@ -37,6 +38,7 @@ else:
jobs = {
x.short_name: x
for x in [
AlarmEotkAwsAutomation,
AlarmProxyAzureCdnAutomation,
AlarmProxyCloudfrontAutomation,
AlarmProxyHTTPStatusAutomation,

View file

@ -1,7 +1,10 @@
from abc import abstractmethod
from datetime import datetime
from typing import Union, List, Optional, Any
from app.alarms import alarms_for
from app.extensions import db
from app.models.alarms import Alarm
class AbstractConfiguration(db.Model): # type: ignore
@ -13,6 +16,15 @@ class AbstractConfiguration(db.Model): # type: ignore
updated = db.Column(db.DateTime(), default=datetime.utcnow, nullable=False)
destroyed = db.Column(db.DateTime(), nullable=True)
@property
def alarms(self) -> List[Alarm]:
return alarms_for(self.brn)
@property
@abstractmethod
def brn(self) -> str:
raise NotImplementedError()
def destroy(self) -> None:
self.destroyed = datetime.utcnow()
self.updated = datetime.utcnow()
@ -59,6 +71,11 @@ class AbstractResource(db.Model): # type: ignore
if self.updated is None:
self.updated = datetime.utcnow()
@property
@abstractmethod
def brn(self) -> str:
raise NotImplementedError()
def deprecate(self, *, reason: str) -> None:
self.deprecated = datetime.utcnow()
self.deprecation_reason = reason

View file

@ -14,38 +14,30 @@ class AlarmState(enum.Enum):
class Alarm(db.Model): # type: ignore
id = db.Column(db.Integer, primary_key=True)
target = db.Column(db.String(60), nullable=False)
group_id = db.Column(db.Integer, db.ForeignKey("group.id"))
origin_id = db.Column(db.Integer, db.ForeignKey("origin.id"))
proxy_id = db.Column(db.Integer, db.ForeignKey("proxy.id"))
bridge_id = db.Column(db.Integer, db.ForeignKey("bridge.id"))
alarm_type = db.Column(db.String(255), nullable=False)
target = db.Column(db.String(255), nullable=False)
aspect = db.Column(db.String(255), nullable=False)
alarm_state = db.Column(db.Enum(AlarmState), default=AlarmState.UNKNOWN, nullable=False)
state_changed = db.Column(db.DateTime(), nullable=False)
last_updated = db.Column(db.DateTime())
text = db.Column(db.String(255))
group = db.relationship("Group", back_populates="alarms")
origin = db.relationship("Origin", back_populates="alarms")
proxy = db.relationship("Proxy", back_populates="alarms")
bridge = db.relationship("Bridge", back_populates="alarms")
last_updated = db.Column(db.DateTime(), nullable=False)
text = db.Column(db.String(255), nullable=False)
@classmethod
def csv_header(cls) -> List[str]:
return [
"id", "target", "group_id", "origin_id", "proxy_id", "bridge_id", "alarm_type",
"alarm_state", "state_changed", "last_updated", "text"
]
return ["id", "target", "alarm_type", "alarm_state", "state_changed", "last_updated", "text"]
def csv_row(self) -> List[Any]:
return [
getattr(self, x) for x in self.csv_header()
]
return [getattr(self, x) for x in self.csv_header()]
def update_state(self, state: AlarmState, text: str) -> None:
from app.models.activity import Activity
if self.alarm_state != state or self.state_changed is None:
self.state_changed = datetime.utcnow()
activity = Activity(activity_type="alarm_state",
text=f"{self.alarm_state.name}->{state.name}! State changed for "
f"{self.aspect} on {self.target}: {text}")
activity.notify()
db.session.add(activity)
self.alarm_state = state
self.text = text
self.last_updated = datetime.utcnow()
db.session.commit()

View file

@ -13,7 +13,6 @@ class Group(AbstractConfiguration):
bridgeconfs = db.relationship("BridgeConf", back_populates="group")
eotks = db.relationship("Eotk", back_populates="group")
onions = db.relationship("Onion", back_populates="group")
alarms = db.relationship("Alarm", back_populates="group")
@classmethod
def csv_header(cls) -> List[str]:

View file

@ -39,7 +39,6 @@ class Bridge(AbstractResource):
bridgeline = db.Column(db.String(255), nullable=True)
conf = db.relationship("BridgeConf", back_populates="bridges")
alarms = db.relationship("Alarm", back_populates="bridge")
@classmethod
def csv_header(cls) -> List[str]:

View file

@ -1,5 +1,6 @@
from typing import Optional, List
from flask import current_app
from tldextract import extract
from app.extensions import db
@ -14,7 +15,10 @@ class Origin(AbstractConfiguration):
group = db.relationship("Group", back_populates="origins")
proxies = db.relationship("Proxy", back_populates="origin")
alarms = db.relationship("Alarm", back_populates="origin")
@property
def brn(self) -> str:
return f"brn:{current_app.config['GLOBAL_NAMESPACE']}:{self.group_id}:mirror:conf:origin/{self.domain_name}"
@classmethod
def csv_header(cls) -> List[str]:
@ -45,7 +49,10 @@ class Proxy(AbstractResource):
url = db.Column(db.String(255), nullable=True)
origin = db.relationship("Origin", back_populates="proxies")
alarms = db.relationship("Alarm", back_populates="proxy")
@property
def brn(self) -> str:
return f"brn:{current_app.config['GLOBAL_NAMESPACE']}:{self.origin.group_id}:mirror:{self.provider}:proxy/{self.id}"
@classmethod
def csv_header(cls) -> List[str]:

View file

@ -1,3 +1,5 @@
from flask import current_app
from app.extensions import db
from app.models import AbstractConfiguration, AbstractResource
@ -17,3 +19,7 @@ class Eotk(AbstractResource):
region = db.Column(db.String(20), nullable=False)
group = db.relationship("Group", back_populates="eotks")
@property
def brn(self) -> str:
return f"brn:{current_app.config['GLOBAL_NAMESPACE']}:{self.group_id}:eotk:{self.provider}:instance/{self.region}"

View file

@ -1,8 +1,9 @@
from datetime import datetime, timedelta, timezone
from typing import Optional
from typing import Optional, Union
from flask import Blueprint, render_template, request
from flask.typing import ResponseReturnValue
from jinja2 import Markup
from sqlalchemy import desc, or_
from app.models.activity import Activity
@ -10,6 +11,7 @@ from app.models.alarms import Alarm, AlarmState
from app.models.bridges import Bridge
from app.models.mirrors import Origin, Proxy
from app.models.base import Group
from app.models.onions import Eotk
from app.portal.automation import bp as automation
from app.portal.bridgeconf import bp as bridgeconf
from app.portal.bridge import bp as bridge
@ -50,11 +52,44 @@ def format_datetime(s: Optional[datetime]) -> str:
return s.strftime("%a, %d %b %Y %H:%M:%S")
@portal.app_template_filter("describe_brn")
def describe_brn(s: str) -> Union[str, Markup]:
    """Template filter turning an alarm target string into a readable label.

    The target is split on ``:``; the fourth field selects the product
    (``mirror`` or ``eotk``), the fifth the provider and the sixth the
    resource (``origin/…``, ``proxy/…``, ``quota/…`` or ``instance/…``).
    Anything that cannot be parsed or resolved in the database is returned
    unchanged so the page still shows the raw identifier.

    :param s: the alarm target string.
    :return: a plain string, or ``Markup`` for the proxy label because it
        embeds a ``<br>`` tag.
    """
    parts = s.split(":")
    if len(parts) < 6:
        # Too few fields to index safely; show the raw value rather than
        # raising IndexError while rendering the template.
        return s
    group_id, product, provider, resource = parts[2], parts[3], parts[4], parts[5]
    if product == "mirror":
        if resource.startswith("origin/"):
            origin = Origin.query.filter(
                Origin.domain_name == resource[len("origin/"):]
            ).first()
            if not origin:
                return s
            return f"Origin: {origin.domain_name} ({origin.group.group_name})"
        if resource.startswith("proxy/"):
            try:
                proxy_id = int(resource[len("proxy/"):])
            except ValueError:
                # Non-numeric proxy identifier: nothing to look up.
                return s
            proxy = Proxy.query.filter(Proxy.id == proxy_id).first()
            if not proxy:
                return s
            # Markup.format escapes its arguments, so only the literal <br>
            # is emitted as HTML; values from the database are never
            # trusted as markup.
            return Markup("Proxy: {}<br>({}: {})").format(
                proxy.url,
                proxy.origin.group.group_name,
                proxy.origin.domain_name,
            )
        if resource.startswith("quota/"):
            if provider == "cloudfront":
                return f"Quota: CloudFront {resource[len('quota/'):]}"
    if product == "eotk":
        if resource.startswith("instance/"):
            eotk = Eotk.query.filter(
                Eotk.group_id == group_id,
                Eotk.region == resource[len("instance/"):]
            ).first()
            if not eotk:
                return s
            return f"EOTK Instance: {eotk.group.group_name} in {eotk.provider} {eotk.region}"
    return s
def total_origins_blocked() -> int:
count = 0
for o in Origin.query.filter(Origin.destroyed.is_(None)).all():
for a in o.alarms:
if a.alarm_type.startswith("origin-block-ooni-"):
if a.aspect.startswith("origin-block-ooni-"):
if a.alarm_state == AlarmState.WARNING:
count += 1
break

View file

@ -28,7 +28,7 @@
<thead>
<tr>
<th scope="col">Resource</th>
<th scope="col">Type</th>
<th scope="col">Aspect</th>
<th scope="col">State</th>
<th scope="col">Message</th>
<th scope="col">Last Update</th>
@ -37,14 +37,8 @@
<tbody>
{% for alarm in alarms %}
<tr class="bg-{% if alarm.alarm_state.name == "OK" %}success{% elif alarm.alarm_state.name == "UNKNOWN" %}dark{% elif alarm.alarm_state.name == "WARNING" %}warning{% else %}danger{% endif %} text-{% if alarm.alarm_state.name == "WARNING" %}dark{% else %}light{% endif %}">
{% if alarm.target == "proxy" %}
<td>Proxy: {{ alarm.proxy.url }}<br />({{ alarm.proxy.origin.domain_name }})</td>
{% elif alarm.target == "origin" %}
<td>Origin: {{ alarm.origin.domain_name }}</td>
{% elif alarm.target == "service/cloudfront" %}
<td>AWS CloudFront</td>
{% endif %}
<td>{{ alarm.alarm_type }}</td>
<td>{{ alarm.target | describe_brn }}</td>
<td>{{ alarm.aspect }}</td>
<td>{{ alarm.alarm_state.name }}</td>
<td>{{ alarm.text }}</td>
<td>{{ alarm.last_updated | format_datetime }}</td>

View file

@ -0,0 +1,54 @@
from typing import Tuple, Optional
import boto3
from sqlalchemy import func
from app import app
from app.alarms import get_or_create_alarm
from app.extensions import db
from app.models.base import Group
from app.models.alarms import AlarmState
from app.models.onions import Eotk
from app.terraform import BaseAutomation
def alarms_in_region(region: str, prefix: str, aspect: str) -> None:
    """Copy CloudWatch alarm states in one AWS region onto EOTK alarms.

    Every CloudWatch metric alarm whose name starts with ``prefix`` is
    examined. The remainder of the name is split on ``-`` and its second
    component is matched against lower-cased group names. The EOTK instance
    for that group in ``region`` then has its ``aspect`` alarm updated.
    Alarms that cannot be matched are reported with ``print`` and skipped.

    :param region: AWS region name used for the CloudWatch client and for
        selecting the EOTK instance.
    :param prefix: CloudWatch alarm name prefix to list and strip.
    :param aspect: aspect name under which the alarm state is recorded.
    """
    cloudwatch = boto3.client('cloudwatch',
                              aws_access_key_id=app.config['AWS_ACCESS_KEY'],
                              aws_secret_access_key=app.config['AWS_SECRET_KEY'],
                              region_name=region)
    dist_paginator = cloudwatch.get_paginator('describe_alarms')
    page_iterator = dist_paginator.paginate(AlarmNamePrefix=prefix)
    for page in page_iterator:
        for cw_alarm in page['MetricAlarms']:
            eotk_id = cw_alarm["AlarmName"][len(prefix):].split("-")
            if len(eotk_id) < 2:
                # Without a second component there is no group name to look
                # up; skip instead of aborting the whole run on IndexError.
                print("Skipping malformed alarm name " + cw_alarm['AlarmName'])
                continue
            group: Optional[Group] = Group.query.filter(func.lower(Group.group_name) == eotk_id[1]).first()
            if group is None:
                print("Unable to find group for " + cw_alarm['AlarmName'])
                continue
            eotk = Eotk.query.filter(
                Eotk.group_id == group.id,
                Eotk.region == region
            ).first()
            if eotk is None:
                print("Skipping unknown instance " + cw_alarm['AlarmName'])
                continue
            alarm = get_or_create_alarm(eotk.brn, aspect)
            if cw_alarm['StateValue'] == "OK":
                alarm.update_state(AlarmState.OK, "CloudWatch alarm OK")
            elif cw_alarm['StateValue'] == "ALARM":
                alarm.update_state(AlarmState.CRITICAL, "CloudWatch alarm ALARM")
            else:
                # Any other CloudWatch state is recorded as UNKNOWN, with
                # the raw state kept in the message.
                alarm.update_state(AlarmState.UNKNOWN, f"CloudWatch alarm {cw_alarm['StateValue']}")
class AlarmEotkAwsAutomation(BaseAutomation):
    """Automation job importing CloudWatch alarm states for AWS EOTK instances."""

    short_name = "monitor_eotk_aws"
    description = "Import alarms for AWS EOTK instances"

    def automate(self, full: bool = False) -> Tuple[bool, str]:
        """Import the bandwidth and CPU alarms for each region, then commit.

        :param full: accepted for interface compatibility; not used here.
        :return: always ``(True, "")``.
        """
        # (CloudWatch alarm name prefix, aspect recorded on the alarm)
        checks = (
            ("eotk-bw-out-high-", "bandwidth-out-high"),
            ("eotk-cpu-high-", "instance-cpu"),
        )
        for region in ("us-east-2", "eu-central-1"):
            for name_prefix, aspect in checks:
                alarms_in_region(region, name_prefix, aspect)
        db.session.commit()
        return True, ""

View file

@ -4,7 +4,7 @@ from azure.identity import ClientSecretCredential
from azure.mgmt.alertsmanagement import AlertsManagementClient
from app import app
from app.alarms import get_proxy_alarm
from app.alarms import get_or_create_alarm
from app.models.alarms import AlarmState
from app.models.mirrors import Proxy
from app.terraform import BaseAutomation
@ -30,7 +30,7 @@ class AlarmProxyAzureCdnAutomation(BaseAutomation):
Proxy.provider == "azure_cdn",
Proxy.destroyed.is_(None)
):
alarm = get_proxy_alarm(proxy.id, "bandwidth-out-high")
alarm = get_or_create_alarm(proxy.brn, "bandwidth-out-high")
if proxy.origin.group.group_name.lower() not in firing:
alarm.update_state(AlarmState.OK, "Azure monitor alert not firing")
else:

View file

@ -2,60 +2,62 @@ import datetime
from typing import Tuple
import boto3
from flask import current_app
from app import app
from app.alarms import get_proxy_alarm
from app.alarms import get_or_create_alarm
from app.extensions import db
from app.models.mirrors import Proxy
from app.models.alarms import AlarmState, Alarm
from app.models.alarms import AlarmState
from app.terraform import BaseAutomation
def _cloudfront_quota() -> None:
    """Update the distribution quota alarm from the number of live proxies.

    The alarm is CRITICAL above 370 deployed proxies, WARNING above 320 and
    OK otherwise; the message always states the current count.
    """
    alarm = get_or_create_alarm(
        f"brn:{current_app.config['GLOBAL_NAMESPACE']}:0:mirror:cloudfront:quota/distributions",
        "quota-usage"
    )
    # Let the database count instead of loading every proxy row just to
    # take len() of the result.
    # NOTE(review): this counts undestroyed proxies of every provider, not
    # only CloudFront ones - confirm whether a provider filter is wanted.
    deployed_count = Proxy.query.filter(Proxy.destroyed.is_(None)).count()
    message = f"{deployed_count} distributions deployed"
    # update_state() sets last_updated itself, so it is not assigned here.
    if deployed_count > 370:
        alarm.update_state(AlarmState.CRITICAL, message)
    elif deployed_count > 320:
        alarm.update_state(AlarmState.WARNING, message)
    else:
        alarm.update_state(AlarmState.OK, message)
def _proxy_alarms() -> None:
    """Copy CloudWatch ``bandwidth-out-high-*`` alarm states onto proxy alarms.

    The part of each CloudWatch alarm name after the prefix is matched
    against ``Proxy.slug``; alarms with no matching proxy are reported with
    ``print`` and skipped.
    """
    client = boto3.client('cloudwatch',
                          aws_access_key_id=app.config['AWS_ACCESS_KEY'],
                          aws_secret_access_key=app.config['AWS_SECRET_KEY'],
                          region_name='us-east-2')
    pages = client.get_paginator('describe_alarms').paginate(
        AlarmNamePrefix="bandwidth-out-high-")
    for page in pages:
        for metric_alarm in page['MetricAlarms']:
            slug = metric_alarm["AlarmName"][len("bandwidth-out-high-"):]
            proxy = Proxy.query.filter(Proxy.slug == slug).first()
            if proxy is None:
                print("Skipping unknown proxy " + slug)
                continue
            alarm = get_or_create_alarm(proxy.brn, "bandwidth-out-high")
            reported = metric_alarm['StateValue']
            if reported == "OK":
                alarm.update_state(AlarmState.OK, "CloudWatch alarm OK")
                continue
            if reported == "ALARM":
                alarm.update_state(AlarmState.CRITICAL, "CloudWatch alarm ALARM")
                continue
            # Any other CloudWatch state is recorded as UNKNOWN.
            alarm.update_state(AlarmState.UNKNOWN, f"CloudWatch alarm {metric_alarm['StateValue']}")
class AlarmProxyCloudfrontAutomation(BaseAutomation):
short_name = "monitor_proxy_cloudfront"
description = "Import alarms for AWS CloudFront proxies"
def automate(self, full: bool = False) -> Tuple[bool, str]:
cloudwatch = boto3.client('cloudwatch',
aws_access_key_id=app.config['AWS_ACCESS_KEY'],
aws_secret_access_key=app.config['AWS_SECRET_KEY'],
region_name='us-east-2')
dist_paginator = cloudwatch.get_paginator('describe_alarms')
page_iterator = dist_paginator.paginate(AlarmNamePrefix="bandwidth-out-high-")
for page in page_iterator:
for cw_alarm in page['MetricAlarms']:
dist_id = cw_alarm["AlarmName"][len("bandwidth-out-high-"):]
proxy = Proxy.query.filter(Proxy.slug == dist_id).first()
if proxy is None:
print("Skipping unknown proxy " + dist_id)
continue
alarm = get_proxy_alarm(proxy.id, "bandwidth-out-high")
if cw_alarm['StateValue'] == "OK":
alarm.update_state(AlarmState.OK, "CloudWatch alarm OK")
elif cw_alarm['StateValue'] == "ALARM":
alarm.update_state(AlarmState.CRITICAL, "CloudWatch alarm ALARM")
else:
alarm.update_state(AlarmState.UNKNOWN, f"CloudWatch alarm {cw_alarm['StateValue']}")
alarm = Alarm.query.filter(
Alarm.alarm_type == "cloudfront-quota"
).first()
if alarm is None:
alarm = Alarm() # type: ignore
alarm.target = "service/cloudfront"
alarm.alarm_type = "cloudfront-quota"
alarm.state_changed = datetime.datetime.utcnow()
db.session.add(alarm)
alarm.last_updated = datetime.datetime.utcnow()
deployed_count = len(Proxy.query.filter(
Proxy.destroyed.is_(None)).all())
old_state = alarm.alarm_state
if deployed_count > 370:
alarm.alarm_state = AlarmState.CRITICAL
elif deployed_count > 320:
alarm.alarm_state = AlarmState.WARNING
else:
alarm.alarm_state = AlarmState.OK
if alarm.alarm_state != old_state:
alarm.state_changed = datetime.datetime.utcnow()
_proxy_alarms()
_cloudfront_quota()
db.session.commit()
return True, ""

View file

@ -3,26 +3,13 @@ from typing import Tuple
import requests
from requests import RequestException
from app.alarms import get_or_create_alarm
from app.extensions import db
from app.models.alarms import Alarm, AlarmState
from app.models.alarms import AlarmState
from app.models.mirrors import Proxy
from app.terraform import BaseAutomation
def set_http_alarm(proxy_id: int, state: AlarmState, text: str) -> None:
alarm = Alarm.query.filter(
Alarm.proxy_id == proxy_id,
Alarm.alarm_type == "http-status"
).first()
if alarm is None:
alarm = Alarm()
alarm.proxy_id = proxy_id
alarm.alarm_type = "http-status"
alarm.target = "proxy"
db.session.add(alarm)
alarm.update_state(state, text)
class AlarmProxyHTTPStatusAutomation(BaseAutomation):
short_name = "alarm_http_status"
description = "Check all deployed proxies for HTTP status code"
@ -40,28 +27,26 @@ class AlarmProxyHTTPStatusAutomation(BaseAutomation):
allow_redirects=False,
timeout=5)
r.raise_for_status()
alarm = get_or_create_alarm(proxy.brn, "http-status")
if r.is_redirect:
set_http_alarm(
proxy.id,
alarm.update_state(
AlarmState.CRITICAL,
f"{r.status_code} {r.reason}"
)
else:
set_http_alarm(
proxy.id,
alarm.update_state(
AlarmState.OK,
f"{r.status_code} {r.reason}"
)
except requests.HTTPError:
set_http_alarm(
proxy.id,
alarm.update_state(
AlarmState.CRITICAL,
f"{r.status_code} {r.reason}"
)
except RequestException as e:
set_http_alarm(
proxy.id,
alarm.update_state(
AlarmState.CRITICAL,
repr(e)
)
db.session.commit()
return True, ""

View file

@ -5,8 +5,9 @@ from typing import Dict, Tuple, Any
import requests
from app.alarms import get_or_create_alarm
from app.extensions import db
from app.models.alarms import Alarm, AlarmState
from app.models.alarms import AlarmState
from app.models.mirrors import Origin
from app.terraform import BaseAutomation
@ -58,20 +59,6 @@ def threshold_origin(domain_name: str) -> Dict[str, Any]:
return ooni
def set_ooni_alarm(origin_id: int, country: str, state: AlarmState, text: str) -> None:
alarm = Alarm.query.filter(
Alarm.origin_id == origin_id,
Alarm.alarm_type == f"origin-block-ooni-{country}"
).first()
if alarm is None:
alarm = Alarm()
alarm.origin_id = origin_id
alarm.alarm_type = f"origin-block-ooni-{country}"
alarm.target = "origin"
db.session.add(alarm)
alarm.update_state(state, text)
class BlockOONIAutomation(BaseAutomation):
short_name = "block_ooni"
description = "Import origin and/or proxy reachability results from OONI"
@ -82,5 +69,8 @@ class BlockOONIAutomation(BaseAutomation):
for origin in origins:
ooni = threshold_origin(origin.domain_name)
for country in ooni:
set_ooni_alarm(origin.id, country.lower(), ooni[country]["state"], ooni[country]["message"])
alarm = get_or_create_alarm(origin.brn,
f"origin-block-ooni-{country.lower()}")
alarm.update_state(ooni[country]["state"], ooni[country]["message"])
db.session.commit()
return True, ""

View file

@ -65,8 +65,7 @@ class EotkAWSAutomation(TerraformAutomation):
aws = aws,
aws.second_region = aws.second_region
}
source = "sr2c/eotk/aws"
version = "0.0.5"
source = "/Users/irl/PycharmProjects/bc-dashboard/terraform/terraform-aws-eotk"
namespace = "{{ global_namespace }}"
tenant = "{{ group.group_name }}"
name = "eotk"