automation: establish an automation framework

This commit is contained in:
Iain Learmonth 2022-05-08 17:20:04 +01:00
parent 1b53bf451c
commit 8abe5d60fa
31 changed files with 586 additions and 274 deletions

View file

View file

@ -0,0 +1,36 @@
from azure.identity import ClientSecretCredential
from azure.mgmt.alertsmanagement import AlertsManagementClient
from app import app
from app.alarms import get_proxy_alarm
from app.models.alarms import AlarmState
from app.models.mirrors import Proxy
from app.terraform import BaseAutomation
class AlarmProxyAzureCdnAutomation(BaseAutomation):
    """Import Azure Monitor bandwidth alerts as alarms on Azure CDN proxies."""

    short_name = "monitor_proxy_azure_cdn"
    description = "Import alarms for Azure CDN proxies"

    def automate(self):
        """Set the "bandwidth-out-high" alarm of every live Azure CDN proxy.

        All alerts are fetched from the Azure alerts management API using the
        credentials found in the application config. Alerts whose name starts
        with ``bandwidth-out-high-bc-`` and whose monitor condition is
        ``"Fired"`` are collected by the remainder of their name. Each proxy
        with provider ``azure_cdn`` and no ``destroyed`` value is then set to
        CRITICAL when the lower-cased group name of its origin is in that
        collection, and to OK otherwise.

        Returns:
            The tuple ``(True, [])``.
        """
        alert_prefix = "bandwidth-out-high-bc-"
        credential = ClientSecretCredential(
            tenant_id=app.config['AZURE_TENANT_ID'],
            client_id=app.config['AZURE_CLIENT_ID'],
            client_secret=app.config['AZURE_CLIENT_SECRET'])
        client = AlertsManagementClient(
            credential,
            app.config['AZURE_SUBSCRIPTION_ID']
        )
        # Built once as a set so the per-proxy membership test below is O(1).
        firing = {
            alert.name[len(alert_prefix):]
            for alert in client.alerts.get_all()
            if alert.name.startswith(alert_prefix)
            and alert.properties.essentials.monitor_condition == "Fired"
        }
        live_proxies = Proxy.query.filter(
            Proxy.provider == "azure_cdn",
            Proxy.destroyed.is_(None),
        )
        for proxy in live_proxies:
            alarm = get_proxy_alarm(proxy.id, "bandwidth-out-high")
            # NOTE(review): the alert-name suffix is compared case-sensitively
            # against the lower-cased group name - confirm alert names are
            # always lower case.
            if proxy.origin.group.group_name.lower() in firing:
                alarm.update_state(AlarmState.CRITICAL, "Azure monitor alert firing")
            else:
                alarm.update_state(AlarmState.OK, "Azure monitor alert not firing")
        return True, []

View file

@ -0,0 +1,60 @@
import datetime
import boto3
from app import app
from app.alarms import get_proxy_alarm
from app.extensions import db
from app.models.mirrors import Proxy
from app.models.alarms import AlarmState, Alarm
from app.terraform import BaseAutomation
class AlarmProxyCloudfrontAutomation(BaseAutomation):
    """Import CloudWatch alarms for CloudFront proxies and track the quota."""

    short_name = "monitor_proxy_cloudfront"
    description = "Import alarms for AWS CloudFront proxies"

    # Deployed-proxy counts that must be exceeded for the "cloudfront-quota"
    # alarm to become WARNING / CRITICAL respectively.
    quota_warning_threshold = 320
    quota_critical_threshold = 370

    def automate(self):
        """Run both CloudFront checks.

        First the CloudWatch "bandwidth-out-high-" alarms are copied onto the
        matching proxies, then the service-wide "cloudfront-quota" alarm is
        recomputed and the database session committed.

        Returns:
            The tuple ``(True, [])``.
        """
        self._import_cloudwatch_alarms()
        self._update_quota_alarm()
        return True, []

    def _import_cloudwatch_alarms(self):
        """Copy the state of each CloudWatch bandwidth alarm onto its proxy."""
        alarm_prefix = "bandwidth-out-high-"
        cloudwatch = boto3.client('cloudwatch',
                                  aws_access_key_id=app.config['AWS_ACCESS_KEY'],
                                  aws_secret_access_key=app.config['AWS_SECRET_KEY'],
                                  region_name='us-east-2')
        paginator = cloudwatch.get_paginator('describe_alarms')
        for page in paginator.paginate(AlarmNamePrefix=alarm_prefix):
            for cw_alarm in page['MetricAlarms']:
                # The part of the alarm name after the prefix is looked up as
                # the proxy slug.
                dist_id = cw_alarm["AlarmName"][len(alarm_prefix):]
                proxy = Proxy.query.filter(Proxy.slug == dist_id).first()
                if proxy is None:
                    print("Skipping unknown proxy " + dist_id)
                    continue
                alarm = get_proxy_alarm(proxy.id, "bandwidth-out-high")
                state_value = cw_alarm['StateValue']
                if state_value == "OK":
                    alarm.update_state(AlarmState.OK, "CloudWatch alarm OK")
                elif state_value == "ALARM":
                    alarm.update_state(AlarmState.CRITICAL, "CloudWatch alarm ALARM")
                else:
                    alarm.update_state(AlarmState.UNKNOWN, f"CloudWatch alarm {state_value}")

    def _update_quota_alarm(self):
        """Recompute the "cloudfront-quota" alarm from the deployed proxy count."""
        alarm = Alarm.query.filter(
            Alarm.alarm_type == "cloudfront-quota"
        ).first()
        if alarm is None:
            alarm = Alarm()
            alarm.target = "service/cloudfront"
            alarm.alarm_type = "cloudfront-quota"
            alarm.state_changed = datetime.datetime.utcnow()
            db.session.add(alarm)
        alarm.last_updated = datetime.datetime.utcnow()
        # Count in the database instead of loading every row just for len().
        # NOTE(review): this counts undestroyed proxies of every provider, not
        # only CloudFront ones - confirm whether that is intended.
        deployed_count = Proxy.query.filter(Proxy.destroyed.is_(None)).count()
        old_state = alarm.alarm_state
        if deployed_count > self.quota_critical_threshold:
            alarm.alarm_state = AlarmState.CRITICAL
        elif deployed_count > self.quota_warning_threshold:
            alarm.alarm_state = AlarmState.WARNING
        else:
            alarm.alarm_state = AlarmState.OK
        # Only move the transition timestamp when the state really changed.
        if alarm.alarm_state != old_state:
            alarm.state_changed = datetime.datetime.utcnow()
        db.session.commit()

View file

@ -0,0 +1,64 @@
from typing import Tuple
import requests
from app.extensions import db
from app.models.alarms import Alarm, AlarmState
from app.models.mirrors import Proxy
from app.terraform import BaseAutomation
def set_http_alarm(proxy_id: int, state: AlarmState, text: str):
    """Record an "http-status" alarm state for a proxy.

    The first existing alarm with this proxy id and alarm type "http-status"
    is reused. If none exists, a new one with target "proxy" is created and
    added to the database session. In both cases the state change itself is
    delegated to ``Alarm.update_state``.

    Args:
        proxy_id: Id of the proxy the alarm belongs to.
        state: State passed on to ``update_state``.
        text: Message passed on to ``update_state``.
    """
    existing = Alarm.query.filter(
        Alarm.proxy_id == proxy_id,
        Alarm.alarm_type == "http-status",
    ).first()
    if existing is not None:
        existing.update_state(state, text)
        return

    # No alarm yet for this proxy: create it before recording the state.
    created = Alarm()
    created.proxy_id = proxy_id
    created.alarm_type = "http-status"
    created.target = "proxy"
    db.session.add(created)
    created.update_state(state, text)
class AlarmProxyHTTPStatusAutomation(BaseAutomation):
    """Probe every deployed proxy over HTTP and record an "http-status" alarm."""

    short_name = "alarm_http_status"
    description = "Check all deployed proxies for HTTP status code"

    def automate(self, full: bool = False) -> Tuple[bool, str]:
        """Fetch each undestroyed proxy's URL and set its HTTP status alarm.

        Proxies without a URL are skipped. Redirects are not followed. A
        redirect response, an HTTP error status, or any request failure marks
        the proxy CRITICAL; every other response marks it OK.

        Args:
            full: Not used by this automation.

        Returns:
            The tuple ``(True, [])``.
        """
        proxies = Proxy.query.filter(Proxy.destroyed.is_(None))
        for proxy in proxies:
            if proxy.url is None:
                continue
            try:
                r = requests.get(proxy.url,
                                 allow_redirects=False,
                                 timeout=5)
                r.raise_for_status()
            except requests.HTTPError as exc:
                state = AlarmState.CRITICAL
                # Read the response from the exception itself so a stale or
                # unbound loop variable can never be reported.
                failed = exc.response
                if failed is not None:
                    text = f"{failed.status_code} {failed.reason}"
                else:
                    text = "HTTP error"
            except (requests.ConnectionError, requests.Timeout):
                state = AlarmState.CRITICAL
                text = "Connection failure"
            except requests.RequestException as exc:
                # Any other request failure (for example a malformed URL)
                # must not abort the checks for the remaining proxies.
                state = AlarmState.CRITICAL
                text = f"Request failure: {type(exc).__name__}"
            else:
                # A proxy that answers with a redirect is treated as failing.
                state = AlarmState.CRITICAL if r.is_redirect else AlarmState.OK
                text = f"{r.status_code} {r.reason}"
            set_http_alarm(proxy.id, state, text)
        # NOTE(review): the annotation promises a str but a list is returned;
        # kept as-is because callers are not visible here.
        return True, []