From 360c78661046b2e43fc0e681fa8b1259fe8d3e48 Mon Sep 17 00:00:00 2001 From: Iain Learmonth Date: Sat, 18 Jun 2022 12:36:54 +0100 Subject: [PATCH] block: try to unify the mirror block modules --- app/models/__init__.py | 18 +++++- app/terraform/block_external.py | 84 ++++------------------------ app/terraform/block_mirror.py | 79 ++++++++++++++++++++++++++ app/terraform/block_roskomsvoboda.py | 42 +++----------- 4 files changed, 114 insertions(+), 109 deletions(-) create mode 100644 app/terraform/block_mirror.py diff --git a/app/models/__init__.py b/app/models/__init__.py index 59eda24..76a959a 100644 --- a/app/models/__init__.py +++ b/app/models/__init__.py @@ -78,18 +78,32 @@ class AbstractResource(db.Model): # type: ignore def brn(self) -> BRN: raise NotImplementedError() - def deprecate(self, *, reason: str) -> None: + def deprecate(self, *, reason: str) -> bool: + """ + Marks the resource as deprecated. In the event that the resource was already + deprecated, no change will be recorded and the function will return False. + + :param reason: an opaque string that records the deprecation reason + :return: if the proxy was deprecated + """ if self.deprecated is not None: logging.info("Deprecating %s (reason=%s)", self.brn, reason) self.deprecated = datetime.utcnow() self.deprecation_reason = reason self.updated = datetime.utcnow() + return True else: logging.info("Not deprecating %s (reason=%s) because it's already deprecated", self.brn, reason) + return False def destroy(self) -> None: + """ + Marks the resource for destruction. + + :return: None + """ if self.deprecated is None: - self.deprecated = datetime.utcnow() + self.deprecate(reason="destroyed") self.destroyed = datetime.utcnow() self.updated = datetime.utcnow() diff --git a/app/terraform/block_external.py b/app/terraform/block_external.py index 2c52976..3afcee3 100644 --- a/app/terraform/block_external.py +++ b/app/terraform/block_external.py @@ -1,96 +1,34 @@ -import datetime -from typing import Tuple, List, Dict +from typing import List, Dict from bs4 import BeautifulSoup import requests from app import app -from app.extensions import db -from app.models.activity import Activity -from app.models.mirrors import Proxy -from app.terraform import BaseAutomation +from app.terraform.block_mirror import BlockMirrorAutomation -class BlockExternalAutomation(BaseAutomation): +class BlockExternalAutomation(BlockMirrorAutomation): """ Automation task to import proxy reachability results from external source. """ short_name = "block_external" description = "Import proxy reachability results from external source" - content: bytes - results: Dict[str, List[str]] + _content: bytes def _fetch(self) -> None: user_agent = {'User-agent': 'BypassCensorship/1.0'} page = requests.get(app.config['EXTERNAL_CHECK_URL'], headers=user_agent) - self.content = page.content + self._content = page.content def _parse(self) -> None: - soup = BeautifulSoup(self.content, 'html.parser') + soup = BeautifulSoup(self._content, 'html.parser') h2 = soup.find_all('h2') # pylint: disable=invalid-name div = soup.find_all('div', class_="overflow-auto mb-5") - results = {} i = 0 - while i < len(h2): - if not div[i].div: - urls = [] - anchors = div[i].find_all('a') - j = 0 - while j < len(anchors): - urls.append(anchors[j].text) - j += 1 - results[h2[i].text] = urls - else: - results[h2[i].text] = [] + for idx, heading in enumerate(h2): + if not div[idx].div and heading.text in app.config['EXTERNAL_VANTAGE_POINTS']: + anchors = div[idx].find_all('a') + for anchor in anchors: + self.patterns.append("https://" + anchor.text) i += 1 - self.results = results - - def automate(self, full: bool = False) -> Tuple[bool, str]: - # TODO: handle errors in fetching remote content - # TODO: handle errors in parsing the remote content - self._fetch() - self._parse() - activities = [] - blocked_proxies = [] - for vantage_point, urls in self.results.items(): - if vantage_point not in app.config['EXTERNAL_VANTAGE_POINTS']: - continue - for url in urls: - print(f"Found {url} blocked") - proxy = Proxy.query.filter( - Proxy.provider == "cloudfront", - Proxy.url == f"https://{url}" - ).first() - if not proxy: - print("Proxy not found") - continue - if not proxy.origin.auto_rotation: - print("Proxy auto-rotation forbidden for origin") - continue - if proxy.added > datetime.datetime.utcnow() - datetime.timedelta(hours=3): - activities.append(Activity( - activity_type="block_warning", - text=( - f"Proxy {proxy.url} for {proxy.origin.domain_name} detected blocked according to " - "external source. REFUSING to rotate because this proxy is less than 3 hours old."))) - continue - blocked_proxies.append(proxy) - if len(blocked_proxies) <= 15: - for proxy in blocked_proxies: - activities.append(Activity( - activity_type="block", - text=(f"Proxy {proxy.url} for {proxy.origin.domain_name} detected blocked according to external " - "source. Rotation scheduled.") - )) - proxy.deprecate(reason="external") - else: - activities.append(Activity( - activity_type="block_warning", - text=( - "More than 15 proxies were marked blocked according to external source. REFUSING to rotate."))) - for activity in activities: - activity.notify() - db.session.add(activity) - db.session.commit() - return True, "" diff --git a/app/terraform/block_mirror.py b/app/terraform/block_mirror.py new file mode 100644 index 0000000..cbd22e3 --- /dev/null +++ b/app/terraform/block_mirror.py @@ -0,0 +1,79 @@ +from datetime import datetime, timedelta +import logging +from abc import abstractmethod +from fnmatch import fnmatch +from typing import Tuple, List + +from app.extensions import db +from app.models.activity import Activity +from app.models.mirrors import Proxy +from app.terraform import BaseAutomation + + +class BlockMirrorAutomation(BaseAutomation): + + patterns: List[str] + + def __init__(self): + """ + Constructor method. + """ + self.patterns = list() + super().__init__() + + def automate(self, full: bool = False) -> Tuple[bool, str]: + self.fetch() + self.parse() + rotated = list() + for pattern in self.patterns: + for proxy in active_proxies(): + if proxy.url is None: + # Not ready yet + continue + if fnmatch(proxy.url, pattern): + logging.debug("Found %s blocked", proxy.url) + if not proxy.origin.auto_rotation: + logging.debug("Proxy auto-rotation forbidden for origin") + continue + if proxy.added > datetime.utcnow() - timedelta(hours=3): + logging.debug("Not rotating a proxy less than 3 hours old") + continue + if proxy.deprecate(reason=self.short_name): + logging.info("Rotated %s", proxy.url) + rotated.append((proxy.url, proxy.origin.domain_name)) + else: + logging.debug("Not rotating a proxy that is already deprecated") + if rotated: + activity = Activity( + activity_type="block", + text=(f"[{self.short_name}] ♻ Rotated {len(rotated)} proxies️️: \n" + + "\n".join([f"* {proxy_domain} ({origin_domain})" for proxy_domain, origin_domain in rotated])) + ) + db.session.add(activity) + db.session.commit() + activity.notify() + return True, "" + + @abstractmethod + def fetch(self): + """ + Fetch the blocklist data. It is the responsibility of the automation task + to persist this within the object for the parse step. + + :return: None + """ + + @abstractmethod + def parse(self): + """ + Parse the blocklist data. + + :return: None + """ + + +def active_proxies() -> List[Proxy]: + return Proxy.query.filter( + Proxy.deprecated.is_(None), + Proxy.destroyed.is_(None) + ).all() diff --git a/app/terraform/block_roskomsvoboda.py b/app/terraform/block_roskomsvoboda.py index 0739631..4d066c7 100644 --- a/app/terraform/block_roskomsvoboda.py +++ b/app/terraform/block_roskomsvoboda.py @@ -1,11 +1,7 @@ -from fnmatch import fnmatch -from typing import Tuple, List +from typing import Any import requests -from app.extensions import db -from app.models.activity import Activity -from app.models.mirrors import Proxy from app.terraform import BaseAutomation @@ -24,32 +20,10 @@ class BlockRoskomsvobodaAutomation(BaseAutomation): description = "Import Russian blocklist from RosKomSvoboda" frequency = 90 - def automate(self, full: bool = False) -> Tuple[bool, str]: - activities = [] - proxies: List[Proxy] = Proxy.query.filter( - Proxy.deprecated.is_(None), - Proxy.destroyed.is_(None) - ).all() - patterns = requests.get("https://reestr.rublacklist.net/api/v2/domains/json").json() - for pattern in patterns: - for proxy in proxies: - if proxy.url is None: - # Not ready yet - continue - if fnmatch(proxy.url[len("https://"):], pattern): - print(f"Found {proxy.url} blocked") - if not proxy.origin.auto_rotation: - print("Proxy auto-rotation forbidden for origin") - continue - proxy.deprecate(reason="roskomsvoboda") - activities.append(Activity( - activity_type="block", - text=(f"Proxy {proxy.url} for {proxy.origin.domain_name} detected blocked " - "according to RosKomSvoboda. Rotation scheduled.") - )) - for activity in activities: - db.session.add(activity) - db.session.commit() - for activity in activities: - activity.notify() - return True, "" + _data: Any + + def fetch(self): + self._data = requests.get("https://reestr.rublacklist.net/api/v2/domains/json").json() + + def parse(self): + self.patterns.extend(["https://" + pattern for pattern in self._data])