import datetime import logging import random import string from collections import OrderedDict from typing import Any, Dict, List, Optional, Tuple, Type from typing import OrderedDict as OrderedDictT from tldextract import tldextract from app import db from app.models.base import Pool from app.models.mirrors import Proxy, Origin from app.terraform import BaseAutomation from app.terraform.proxy import ProxyAutomation from app.terraform.proxy.azure_cdn import ProxyAzureCdnAutomation from app.terraform.proxy.cloudfront import ProxyCloudfrontAutomation from app.terraform.proxy.fastly import ProxyFastlyAutomation PROXY_PROVIDERS: Dict[str, Type[ProxyAutomation]] = {p.provider: p for p in [ # type: ignore[attr-defined] # In order of preference ProxyCloudfrontAutomation, ProxyFastlyAutomation, ProxyAzureCdnAutomation ] if p.enabled} # type: ignore[attr-defined] SubgroupCount = OrderedDictT[str, OrderedDictT[int, OrderedDictT[int, int]]] def all_active_proxies() -> List[Proxy]: """ Retrieve all active proxies from the database. This function returns a list of all `Proxy` instances that are currently active. An active proxy is defined as a proxy that is not deprecated and not destroyed. :return: A list of all active Proxy instances. """ result: List[Proxy] = Proxy.query.filter( Proxy.deprecated.is_(None), Proxy.destroyed.is_(None), ).all() return result def random_slug(origin_domain_name: str) -> str: """ Generate a random slug consisting of a prefix extracted from a domain name and a series of random lower case letters. The function extracts the domain from the given `origin_domain_name`, trims it to the first 5 characters, and appends 12 random lower case letters. :param origin_domain_name: The domain name to extract the prefix from. :return: The generated random slug. :Example: >>> random_slug("example.com") "exampasdfghjkl" """ # The random slug doesn't need to be cryptographically secure, hence the use of `# nosec` return tldextract.extract(origin_domain_name).domain[:5] + ''.join( random.choices(string.ascii_lowercase, k=12)) # nosec def calculate_subgroup_count(proxies: Optional[List[Proxy]] = None) -> SubgroupCount: """ Calculate the count of each subgroup within each group for each provider. The function loops through the list of Proxy objects and creates a nested dictionary structure. The keys of the outermost dictionary are the providers. The values are dictionaries where the keys are the group IDs and the values are dictionaries where the keys are subgroups and the values are their counts. :param proxies: A list of Proxy objects. If None, the calculation will be performed on all active proxies. :return: A nested dictionary representing the count of each subgroup within each group for each provider. """ if proxies is None: proxies = all_active_proxies() subgroup_count: SubgroupCount = OrderedDict() for proxy in proxies: if proxy.provider not in subgroup_count: subgroup_count[proxy.provider] = OrderedDict() if proxy.origin.group_id not in subgroup_count[proxy.provider]: subgroup_count[proxy.provider][proxy.origin.group_id] = OrderedDict() if proxy.psg not in subgroup_count[proxy.provider][proxy.origin.group_id]: subgroup_count[proxy.provider][proxy.origin.group_id][proxy.psg] = 1 else: subgroup_count[proxy.provider][proxy.origin.group_id][proxy.psg] += 1 return subgroup_count def next_subgroup(subgroup_count: SubgroupCount, provider: str, group_id: int, max_subgroup_count: int, max_subgroup_members: int) -> Optional[int]: """ Find the first available subgroup with less than the specified maximum count in the specified provider and group. If the last subgroup in the group is full, return the next subgroup number as long as it doesn't exceed `max_subgroup`. The function traverses the `subgroup_count` dictionary for the given provider and group in the order of subgroup. It returns the first subgroup found with a count less than `max_count`. :param subgroup_count: A nested dictionary representing the count of each subgroup within each group for each provider. :param provider: The provider to find the next subgroup in. :param group_id: The group to find the next subgroup in. :param max_subgroup_count: The maximum allowable subgroup number. :param max_subgroup_members: The maximum count a subgroup should have to be considered available. :return: The subgroup of the first available subgroup within the specified provider and group. If no available subgroup is found and max_subgroup is not exceeded, returns the next subgroup number. If no subgroup is available and max_subgroup is exceeded, returns None. """ if provider in subgroup_count and group_id in subgroup_count[provider]: subgroups = subgroup_count[provider][group_id] for subgroup in range(1, max_subgroup_count + 1): if subgroup not in subgroups or subgroups[subgroup] < max_subgroup_members: return subgroup return None return 1 def auto_deprecate_proxies() -> None: """ Automatically deprecate proxies based on certain conditions. This function deprecates proxies under two conditions: 1. The origin of the proxy has been destroyed. 2. The proxy belongs to a list of origins due for daily replacement and has reached its max age. .. note:: - The "origin_destroyed" reason means the origin of the proxy has been destroyed. - The "max_age_reached" reason means the proxy has been in use for longer than the maximum allowed period. The maximum age cutoff is randomly set to a time between 24 and 48 hours. """ proxies: List[Proxy] = all_active_proxies() for proxy in proxies: if proxy.origin.destroyed is not None: proxy.deprecate(reason="origin_destroyed") if proxy.origin.assets and proxy.origin.auto_rotation: max_age_cutoff = datetime.datetime.utcnow() - datetime.timedelta( days=1, seconds=86400 * random.random()) # nosec: B311 if proxy.added < max_age_cutoff: proxy.deprecate(reason="max_age_reached") def destroy_expired_proxies() -> None: """ Destroy proxies that have been deprecated for a certain period of time. This function finds all proxies that are not already destroyed and have been deprecated for more than 4 days. It then destroys these proxies. """ expiry_cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=4) proxies = Proxy.query.filter( Proxy.destroyed.is_(None), Proxy.deprecated < expiry_cutoff ).all() for proxy in proxies: logging.debug("Destroying expired proxy") proxy.destroy() def promote_hot_spare_proxy(pool_id: int, origin: Origin) -> bool: """ Promote a 'hot spare' proxy to a specified pool from the reserve pool. This function searches for a 'hot spare' proxy (a proxy in reserve pool with pool_id == -1) for the given origin. If a proxy is found, it is promoted to the specified pool by changing its pool ID. The added timestamp is also reset to the time at which the hot spare was promoted. :param pool_id: The pool to which the 'hot spare' proxy is to be promoted. :param origin: The origin of the 'hot spare' proxy to be promoted. :return: True if a 'hot spare' proxy was found and promoted, False otherwise. .. note:: In the database, the pool ID -1 signifies a reserve pool of 'hot spare' proxies. This pool is created by default in the schema migrations. """ proxy = Proxy.query.filter( Proxy.pool_id == -1, Proxy.origin_id == origin.id, ).first() if not proxy: return False proxy.pool_id = pool_id proxy.added = datetime.datetime.utcnow() return True class ProxyMetaAutomation(BaseAutomation): short_name = "proxy_meta" description = "Housekeeping for proxies" frequency = 1 subgroup_count: SubgroupCount def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) self.subgroup_count = calculate_subgroup_count() def automate(self, full: bool = False) -> Tuple[bool, str]: # Deprecate orphaned proxies, old proxies and mismatched proxies auto_deprecate_proxies() destroy_expired_proxies() self.handle_missing_proxies() self.create_hot_spare_proxies() db.session.commit() return True, "" def handle_missing_proxies(self) -> None: """ Create new proxies for origins that lack active proxies in a pool. This function iterates over all pools, groups in each pool, and origins in each group. If an origin is not destroyed and lacks active (not deprecated and not destroyed) proxies in a pool, a new proxy for the origin in the pool is created. """ pools = Pool.query.all() for pool in pools: for group in pool.groups: for origin in group.origins: if origin.destroyed is not None: continue proxies = [ x for x in origin.proxies if x.pool_id == pool.id and x.deprecated is None and x.destroyed is None ] if not proxies: logging.debug("Creating new proxy for %s in pool %s", origin, pool) if not promote_hot_spare_proxy(pool.id, origin): # No "hot spare" available self.create_proxy(pool.id, origin) def create_proxy(self, pool_id: int, origin: Origin) -> bool: """ Creates a web proxy resource for the given origin and pool combination. Initially it will attempt to create smart proxies on providers that support smart proxies, and "simple" proxies on other providers. If other providers have exhausted their quota already then a "simple" proxy may be created on a platform that supports smart proxies. A boolean is returned to indicate whether a proxy resource was created. :param pool_id: pool to create the resource for :param origin: origin to create the resource for :return: whether a proxy resource was created """ for provider in PROXY_PROVIDERS.values(): logging.debug("Looking at provider %s", provider.provider) subgroup = next_subgroup(self.subgroup_count, provider.provider, origin.group_id, provider.subgroup_members_max, provider.subgroup_count_max) if subgroup is None: continue # Exceeded maximum number of subgroups and last subgroup is full self.increment_subgroup(provider.provider, origin.group_id, subgroup) proxy = Proxy() proxy.pool_id = pool_id proxy.origin_id = origin.id proxy.provider = provider.provider proxy.psg = subgroup # The random usage below is good enough for its purpose: to create a slug that # hasn't been used recently. proxy.slug = random_slug(origin.domain_name) proxy.added = datetime.datetime.utcnow() proxy.updated = datetime.datetime.utcnow() logging.debug("Creating proxy %s", proxy) db.session.add(proxy) return True return False def increment_subgroup(self, provider: str, group_id: int, psg: int) -> None: """ Increment the count of a specific subgroup within a group for a specific provider. This function mutates the `subgroup_count` dictionary by incrementing the count of the specified subgroup. If the provider, group, or subgroup does not exist in `subgroup_count`, they are created. :param provider: The provider to increment the subgroup count for. :param group_id: The group to increment the subgroup count for. :param psg: The subgroup to increment the count of. """ if provider not in self.subgroup_count: self.subgroup_count[provider] = OrderedDict() if group_id not in self.subgroup_count[provider]: self.subgroup_count[provider][group_id] = OrderedDict() if psg not in self.subgroup_count[provider][group_id]: self.subgroup_count[provider][group_id][psg] = 0 self.subgroup_count[provider][group_id][psg] += 1 def create_hot_spare_proxies(self) -> None: """ Create 'hot spare' proxies for origins that lack active proxies. This function iterates over all groups and their origins. If an origin is not destroyed and lacks active proxies (not deprecated and not destroyed), a new 'hot spare' proxy for this origin is created in the reserve pool (with pool_id = -1). """ origins = Origin.query.filter( Origin.destroyed.is_(None) ).all() for origin in origins: if origin.destroyed is not None: continue proxies = Proxy.query.filter( Proxy.pool_id == -1, Proxy.origin_id == origin.id, Proxy.deprecated.is_(None), Proxy.destroyed.is_(None), ).all() if not proxies: logging.debug("Creating new hot spare proxy for origin %s", origin) self.create_proxy(-1, origin) # Creating proxy in reserve pool