import json
import logging
from io import BytesIO
from typing import Any, Optional
from zipfile import ZipFile, BadZipFile

import lxml
import requests
from lxml.etree import XMLSyntaxError

from app.extensions import db
from app.models.activity import Activity
from app.models.tfstate import TerraformState
from app.terraform.block_mirror import BlockMirrorAutomation


class BlockRoskomsvobodaAutomation(BlockMirrorAutomation):
    """
    Automation task to import Russian blocklist from RosKomSvoboda.

    This task will import the Russian state register of prohibited sites,
    which is part of the enforcement of federal laws of the Russian Federation
    No. 139-FZ, No. 187-FZ, No. 398-FZ and a number of others that regulate
    the dissemination of information on the Internet.

    Where proxies are found to be blocked they will be rotated.
    """

    short_name = "block_roskomsvoboda"
    description = "Import Russian blocklist from RosKomSvoboda"
    frequency = 300

    # Raw bytes of ``dump.xml`` from the most recently downloaded dump, or
    # ``None`` when there is nothing new to parse.
    _data: Any

    def _notify_failure(self, problem: str) -> None:
        """
        Record and send an "automation" activity describing a failed step.

        :param problem: sentence(s) describing what went wrong; the task name
            prefix and the standard "task has not been disabled" trailer are
            added here.
        """
        activity = Activity(
            activity_type="automation",
            text=(f"[{self.short_name}] 🚨 {problem} "
                  "The automation task has not been disabled and will attempt to download the next dump when the "
                  "latest dump revision is incremented at the server."))
        activity.notify()
        db.session.add(activity)
        db.session.commit()

    def _fetch(self, latest_rev: str) -> None:
        """
        Download dump *latest_rev* and store its ``dump.xml`` in ``self._data``.

        On an HTTP error status, an unreadable zip file, or a zip file without
        a ``dump.xml`` member, an activity is recorded and ``self._data`` is
        left as ``None``. Connection-level errors raised by ``requests``
        propagate to the caller.

        :param latest_rev: revision identifier as returned by the server.
        """
        self._data = None
        # NOTE(review): TLS certificate verification is disabled for this
        # request - confirm this is still required for the upstream server.
        response = requests.get(f"https://dumps.rublacklist.net/fetch/{latest_rev}",
                                verify=False, timeout=180)
        try:
            response.raise_for_status()
        except requests.HTTPError:
            self._notify_failure(
                f"Unable to download dump {latest_rev} due to HTTP error {response.status_code}.")
            return
        try:
            # The context manager guarantees the archive is closed again.
            with ZipFile(BytesIO(response.content)) as archive:
                self._data = archive.read("dump.xml")
        except (BadZipFile, KeyError):
            # KeyError: the archive opened fine but has no "dump.xml" member.
            self._data = None
            self._notify_failure(
                f"Unable to extract zip file from dump {latest_rev}. There was an error "
                "related to the format of the zip file.")
            return
        logging.debug("Downloaded %s bytes in dump %s", len(self._data), latest_rev)

    def fetch(self) -> None:
        """
        Check the server for a new dump revision and download it if needed.

        The last seen revision is persisted as JSON in the ``TerraformState``
        row keyed ``block_roskomsvoboda``. The new revision is recorded
        *before* the download is attempted, so a failed download is not
        retried until the server publishes another revision.
        """
        state: Optional[TerraformState] = TerraformState.query.filter(
            TerraformState.key == "block_roskomsvoboda").first()
        if state is None:
            latest_metadata = {"dump_rev": "0"}
        else:
            latest_metadata = json.loads(state.state)

        # NOTE(review): TLS certificate verification is disabled for this
        # request - confirm this is still required for the upstream server.
        response = requests.get("https://dumps.rublacklist.net/fetch/latest",
                                verify=False, timeout=30)
        try:
            response.raise_for_status()
        except requests.HTTPError:
            # Never treat the body of an error page as a revision identifier.
            logging.warning("Unable to determine latest revision: HTTP error %s",
                            response.status_code)
            self._data = None
            return
        latest_rev = response.text.strip()
        logging.debug("Latest revision is %s, already got %s", latest_rev, latest_metadata["dump_rev"])

        if latest_rev == latest_metadata["dump_rev"]:
            self._data = None
            return

        if state is None:
            # Only create the row once there is a revision to store in it.
            state = TerraformState()
            state.key = "block_roskomsvoboda"
            db.session.add(state)
        state.state = json.dumps({"dump_rev": latest_rev})
        db.session.commit()
        self._fetch(latest_rev)

    def parse(self) -> None:
        """
        Extract blocked domains from the downloaded XML into ``self.patterns``.

        Every non-empty ``<domain>`` element contributes one
        ``"https://<domain>"`` pattern. When the XML is malformed an activity
        is recorded; patterns collected before the syntax error are kept.
        """
        if not self._data:
            logging.debug("No new data to parse")
            return
        try:
            for _event, element in lxml.etree.iterparse(BytesIO(self._data)):
                if element.tag == "domain":
                    # An empty <domain/> has text None; skip it instead of
                    # crashing or emitting a bare "https://" pattern.
                    domain = (element.text or "").strip()
                    if domain:
                        self.patterns.append("https://" + domain)
                # Release the finished element so the whole tree is not held
                # in memory while streaming through the dump.
                element.clear()
        except XMLSyntaxError:
            self._notify_failure(
                "Unable to parse XML file from dump. There was an error "
                "related to the format of the XML file within the zip file. Interestingly we were able to "
                "extract the file from the zip file fine.")
        logging.debug("Found %s patterns", len(self.patterns))