import requests
from bs4 import BeautifulSoup

from app import app
from app.terraform.block_mirror import BlockMirrorAutomation


class BlockExternalAutomation(BlockMirrorAutomation):
    """
    Automation task to import proxy reachability results from external source.
    """

    short_name = "block_external"
    description = "Import proxy reachability results from external source"

    # Raw HTML fetched from the external check service; written by fetch(),
    # consumed by parse().
    _content: bytes

    def fetch(self) -> None:
        """Download the external check page and store its raw body in
        ``self._content``.

        The URL comes from ``app.config['EXTERNAL_CHECK_URL']``; a 30 second
        timeout prevents the automation run from hanging on a stalled server.
        """
        user_agent = {'User-agent': 'BypassCensorship/1.0'}
        page = requests.get(app.config['EXTERNAL_CHECK_URL'],
                            headers=user_agent, timeout=30)
        self._content = page.content

    def parse(self) -> None:
        """Extract blocked-proxy URL patterns from the fetched HTML.

        Each ``<h2>`` heading is paired positionally with a
        ``<div class="overflow-auto mb-5">`` section. Sections that contain
        a nested ``<div>`` are skipped (presumably they hold no plain anchor
        list — TODO confirm against the page layout); otherwise every
        anchor's text is appended to ``self.patterns`` as an ``https://``
        pattern.
        """
        soup = BeautifulSoup(self._content, 'html.parser')
        headings = soup.find_all('h2')
        sections = soup.find_all('div', class_="overflow-auto mb-5")
        # zip() pairs headings with sections and stops at the shorter list,
        # avoiding the IndexError the original positional indexing
        # (sections[idx]) would raise on a heading/section count mismatch.
        for _heading, section in zip(headings, sections):
            if not section.div:
                for anchor in section.find_all('a'):
                    self.patterns.append("https://" + anchor.text)