From 19681d1ecaba57132fb5adb7078a1201ba4ae8f4 Mon Sep 17 00:00:00 2001
From: Iain Learmonth
Date: Wed, 26 Apr 2023 16:01:36 +0100
Subject: [PATCH] feat(block): use json list for external block checks

---
Review notes (text in this section is commentary and is not part of the commit):
- fetch() now calls raise_for_status() before decoding the response body.
- parse() now rejects fetched data that is not a JSON list of strings.
- Docstrings were added to the new helpers and to fetch()/parse().
- This patch was rebuilt from a copy whose line breaks had been flattened. The
  indentation of context and removed lines is reconstructed and must be checked
  against the pre-image before applying.
- The post-image blob id on the "index" line predates these review edits.

 app/terraform/block_external.py | 62 +++++++++++++++++++++++++++-------
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/app/terraform/block_external.py b/app/terraform/block_external.py
index e47c77f..e648b94 100644
--- a/app/terraform/block_external.py
+++ b/app/terraform/block_external.py
@@ -1,10 +1,39 @@
-from bs4 import BeautifulSoup
+import logging
+
 import requests
 
 from app import app
 from app.terraform.block_mirror import BlockMirrorAutomation
 
 
+def _trim_prefix(s: str, prefix: str) -> str:
+    """
+    Return the string with the prefix removed from the start of the string if present.
+
+    :param s: String to modify.
+    :param prefix: Prefix to remove.
+    :return: Modified string.
+    """
+    if s.startswith(prefix):
+        return s[len(prefix):]
+    return s
+
+
+def trim_http_https(s: str) -> str:
+    """
+    Return the string with "http://" or "https://" removed from the start of the string if present.
+
+    "https://" is removed first and "http://" is then removed from the result, so both are removed from a
+    string that starts with "https://http://".
+
+    :param s: String to modify.
+    :return: Modified string.
+    """
+    return _trim_prefix(
+        _trim_prefix(s, "https://"),
+        "http://")
+
+
 class BlockExternalAutomation(BlockMirrorAutomation):
     """
     Automation task to import proxy reachability results from external source.
@@ -16,17 +45,24 @@ class BlockExternalAutomation(BlockMirrorAutomation):
 
     def fetch(self) -> None:
+        """
+        Fetch the external check results and store the decoded JSON in self._data.
+
+        :raises requests.HTTPError: If the server responds with an HTTP error status.
+        """
         user_agent = {'User-agent': 'BypassCensorship/1.0'}
-        page = requests.get(app.config['EXTERNAL_CHECK_URL'], headers=user_agent, timeout=30)
-        self._content = page.content
+        response = requests.get(app.config['EXTERNAL_CHECK_URL'], headers=user_agent, timeout=30)
+        # Do not parse the body of an HTTP error response as if it were check results.
+        response.raise_for_status()
+        self._data = response.json()
 
     def parse(self) -> None:
-        soup = BeautifulSoup(self._content, 'html.parser')
-        h2 = soup.find_all('h2')  # pylint: disable=invalid-name
-        div = soup.find_all('div', class_="overflow-auto mb-5")
-        i = 0
-        for idx, heading in enumerate(h2):
-            if not div[idx].div:
-                anchors = div[idx].find_all('a')
-                for anchor in anchors:
-                    self.patterns.append("https://" + anchor.text)
-            i += 1
+        """
+        Add an "https://" pattern to self.patterns for each URL in the fetched JSON list.
+
+        :raises ValueError: If the fetched data is not a list of strings.
+        """
+        # The comprehension below would also iterate a JSON object or string, so reject unexpected shapes.
+        if not isinstance(self._data, list) or not all(isinstance(pattern, str) for pattern in self._data):
+            raise ValueError("External check results are not a JSON list of strings")
+        self.patterns.extend(["https://" + trim_http_https(pattern) for pattern in self._data])
+        logging.debug("Found URLs: %s", self.patterns)