majuna/app/terraform/block_external.py

97 lines
3.6 KiB
Python
Raw Normal View History

2022-05-23 10:55:59 +01:00
import datetime
2022-06-17 12:42:42 +01:00
from typing import Tuple, List, Dict
2022-03-10 14:26:22 +00:00
from bs4 import BeautifulSoup
import requests
from app import app
2022-05-01 16:23:45 +01:00
from app.extensions import db
from app.models.activity import Activity
2022-04-22 14:01:16 +01:00
from app.models.mirrors import Proxy
from app.terraform import BaseAutomation
class BlockExternalAutomation(BaseAutomation):
    """
    Automation task to import proxy reachability results from external source.

    The task downloads an HTML report from ``app.config['EXTERNAL_CHECK_URL']``, collects the
    link texts listed under each ``<h2>`` heading, and, for the headings named in
    ``app.config['EXTERNAL_VANTAGE_POINTS']``, deprecates the matching "cloudfront" proxies.
    """
    short_name = "block_external"
    description = "Import proxy reachability results from external source"

    # Seconds to wait for the external source before giving up; without a timeout
    # requests may block indefinitely on an unresponsive server.
    _FETCH_TIMEOUT = 30
    # Proxies added more recently than this many hours ago are never rotated by this task.
    _MIN_PROXY_AGE_HOURS = 3
    # Safety limit: if more proxies than this are reported blocked in one run, none are rotated.
    _MAX_ROTATIONS = 15

    # Raw body of the fetched report (set by _fetch).
    content: bytes
    # Mapping of heading text to the link texts listed below it (set by _parse).
    results: Dict[str, List[str]]

    def _fetch(self) -> None:
        """
        Download the external report and store its raw body in ``self.content``.

        :raises requests.RequestException: on connection errors, timeouts or HTTP error statuses.
        """
        user_agent = {'User-agent': 'BypassCensorship/1.0'}
        page = requests.get(
            app.config['EXTERNAL_CHECK_URL'],
            headers=user_agent,
            timeout=self._FETCH_TIMEOUT,
        )
        # Do not try to parse an error page as if it were a report.
        page.raise_for_status()
        self.content = page.content

    def _parse(self) -> None:
        """
        Parse ``self.content`` and store the per-heading results in ``self.results``.

        The n-th ``<h2>`` is paired with the n-th ``<div class="overflow-auto mb-5">``.

        :raises ValueError: if there are fewer result sections than headings.
        """
        soup = BeautifulSoup(self.content, 'html.parser')
        headings = soup.find_all('h2')
        sections = soup.find_all('div', class_="overflow-auto mb-5")
        if len(sections) < len(headings):
            # Previously this surfaced as an opaque IndexError part-way through parsing.
            raise ValueError(
                f"found {len(headings)} headings but only {len(sections)} result sections")
        results: Dict[str, List[str]] = {}
        for heading, section in zip(headings, sections):
            if section.div:
                # A nested <div> is treated as "no entries" for this heading
                # (presumably a placeholder shown when nothing is blocked -- TODO confirm).
                results[heading.text] = []
            else:
                results[heading.text] = [anchor.text for anchor in section.find_all('a')]
        self.results = results

    def automate(self, full: bool = False) -> Tuple[bool, str]:
        """
        Fetch the external report and schedule rotation of the proxies it reports blocked.

        A proxy is skipped when it is not found, when its origin has auto-rotation disabled,
        or when it was added less than ``_MIN_PROXY_AGE_HOURS`` hours ago (a warning activity
        is recorded in the last case). If more than ``_MAX_ROTATIONS`` proxies remain, nothing
        is rotated and a single warning activity is recorded instead.

        :param full: not used by this task.
        :return: ``(True, "")`` on success, or ``(False, message)`` when the report could not
            be fetched or did not have the expected structure.
        """
        try:
            self._fetch()
        except requests.RequestException as exc:
            return False, f"Failed to fetch external reachability results: {exc}"
        # TODO: handle other errors in parsing the remote content
        try:
            self._parse()
        except ValueError as exc:
            return False, f"Failed to parse external reachability results: {exc}"

        # Collect each reported hostname once, in first-seen order. A hostname listed under
        # several accepted vantage points must not be rotated twice or counted twice against
        # the safety limit.
        vantage_points = app.config['EXTERNAL_VANTAGE_POINTS']
        blocked_urls: Dict[str, None] = {}
        for vantage_point, urls in self.results.items():
            if vantage_point in vantage_points:
                blocked_urls.update(dict.fromkeys(urls))

        activities = []
        blocked_proxies = []
        # Naive UTC timestamp, compared against proxy.added
        # (presumably stored as naive UTC as well -- TODO confirm).
        newest_allowed = datetime.datetime.utcnow() - datetime.timedelta(hours=self._MIN_PROXY_AGE_HOURS)
        for url in blocked_urls:
            print(f"Found {url} blocked")
            proxy = Proxy.query.filter(
                Proxy.provider == "cloudfront",
                Proxy.url == f"https://{url}"
            ).first()
            if not proxy:
                print("Proxy not found")
                continue
            if not proxy.origin.auto_rotation:
                print("Proxy auto-rotation forbidden for origin")
                continue
            if proxy.added > newest_allowed:
                activities.append(Activity(
                    activity_type="block_warning",
                    text=(
                        f"Proxy {proxy.url} for {proxy.origin.domain_name} detected blocked according to "
                        "external source. REFUSING to rotate because this proxy is less than "
                        f"{self._MIN_PROXY_AGE_HOURS} hours old.")))
                continue
            blocked_proxies.append(proxy)

        if len(blocked_proxies) <= self._MAX_ROTATIONS:
            for proxy in blocked_proxies:
                activities.append(Activity(
                    activity_type="block",
                    text=(f"Proxy {proxy.url} for {proxy.origin.domain_name} detected blocked according to external "
                          "source. Rotation scheduled.")
                ))
                proxy.deprecate(reason="external")
        else:
            # An unusually large batch is treated as suspect, so nothing is rotated.
            activities.append(Activity(
                activity_type="block_warning",
                text=(
                    f"More than {self._MAX_ROTATIONS} proxies were marked blocked according to external source. "
                    "REFUSING to rotate.")))

        for activity in activities:
            activity.notify()
            db.session.add(activity)
        db.session.commit()
        return True, ""