feat(block): use json list for external block checks

This commit is contained in:
Iain Learmonth 2023-04-26 16:01:36 +01:00
parent c424b9a5fa
commit 19681d1eca

View file

@@ -1,10 +1,29 @@
from bs4 import BeautifulSoup
import logging
import requests
from app import app
from app.terraform.block_mirror import BlockMirrorAutomation
def _trim_prefix(s: str, prefix: str) -> str:
if s.startswith(prefix):
return s[len(prefix):]
return s
def trim_http_https(s: str) -> str:
    """
    Return the string with "http://" or "https://" removed from the start of the string if present.

    The "https://" prefix is checked first and "http://" second, each at most once, and the match
    is case-sensitive.

    :param s: String to modify.
    :return: Modified string.
    """
    return _trim_prefix(
        _trim_prefix(s, "https://"),
        "http://")
class BlockExternalAutomation(BlockMirrorAutomation):
"""
Automation task to import proxy reachability results from external source.
@@ -16,17 +35,8 @@ class BlockExternalAutomation(BlockMirrorAutomation):
def fetch(self) -> None:
# Download the external check data from app.config['EXTERNAL_CHECK_URL'], sending a
# custom User-agent header and using a 30-second timeout.
# NOTE(review): indentation and diff +/- markers are missing in this view. The two
# lines that keep the raw body in self._content look like the removed HTML-based
# version, and the self._data line like its JSON replacement - confirm against the
# repository before relying on either.
user_agent = {'User-agent': 'BypassCensorship/1.0'}
page = requests.get(app.config['EXTERNAL_CHECK_URL'], headers=user_agent, timeout=30)
self._content = page.content
# Same request, but the response body is decoded as JSON and kept in self._data.
self._data = requests.get(app.config['EXTERNAL_CHECK_URL'], headers=user_agent, timeout=30).json()
def parse(self) -> None:
# Fill self.patterns from the fetched data, then log the collected URLs at debug level.
# NOTE(review): indentation and diff +/- markers are missing in this view. The
# BeautifulSoup scraping below presumably belongs to the removed version and the single
# self.patterns.extend(...) line to the JSON-based replacement - confirm against the
# repository.
soup = BeautifulSoup(self._content, 'html.parser')
h2 = soup.find_all('h2') # pylint: disable=invalid-name
div = soup.find_all('div', class_="overflow-auto mb-5")
# `i` is incremented below but never read in the lines visible here.
i = 0
# The headings only supply the index into `div`; `heading` itself is not used.
for idx, heading in enumerate(h2):
# Only a div for which the `.div` lookup is falsy contributes anchors
# (presumably: it has no nested <div> - verify against BeautifulSoup's tag-name lookup).
if not div[idx].div:
anchors = div[idx].find_all('a')
for anchor in anchors:
# The anchor text is stored with "https://" prepended.
self.patterns.append("https://" + anchor.text)
i += 1
# Each entry of self._data has a leading "http://" / "https://" removed, then "https://" prepended.
self.patterns.extend(["https://" + trim_http_https(pattern) for pattern in self._data])
logging.debug("Found URLs: %s", self.patterns)