import json
import logging
from io import BytesIO
from typing import Any, Optional
from zipfile import BadZipFile, ZipFile

import lxml  # nosec: B410
import requests
from lxml.etree import XMLSyntaxError  # nosec: B410

from app.extensions import db
from app.models.activity import Activity
from app.models.tfstate import TerraformState
from app.terraform.block_mirror import BlockMirrorAutomation

# TODO: Security considerations for lxml
#
# This module makes use of lxml for parsing XML. There are some known issues relating to
# malicious XML being crafted to exploit XML parsers such that they will exhaust available
# CPU and RAM. Here we use the event-driven parser and disable entity resolution so this
# should help to reduce the risks however a more in-depth review would be good in the future.

# Sentence appended to every failure notification raised by this task.
_RETRY_NOTICE = (
    "The automation task has not been disabled and will attempt to download the next dump when the "
    "latest dump revision is incremented at the server."
)


class BlockRoskomsvobodaAutomation(BlockMirrorAutomation):
    """
    Automation task to import Russian blocklist from RosKomSvoboda.

    This task will import the Russian state register of prohibited sites, which is part of the enforcement of
    federal laws of the Russian Federation No. 139-FZ, No. 187-FZ, No. 398-FZ and a number of others that regulate
    the dissemination of information on the Internet.

    Where proxies are found to be blocked they will be rotated.
    """

    short_name = "block_roskomsvoboda"
    description = "Import Russian blocklist from RosKomSvoboda"
    frequency = 300

    # Raw bytes of ``dump.xml`` from the most recent download, or None when
    # there is nothing new to parse.
    _data: Any

    def _report_dump_failure(self, text: str) -> None:
        """Record an "automation" Activity carrying *text*, notify it and commit it."""
        activity = Activity(activity_type="automation", text=text)
        activity.notify()
        db.session.add(activity)
        db.session.commit()

    def _fetch(self, latest_rev: str) -> None:
        """
        Download dump revision *latest_rev* and keep its ``dump.xml`` in ``self._data``.

        ``self._data`` is left as None when the server answers with an HTTP error status or when the
        archive cannot be read (not a zip file, or no ``dump.xml`` member); in those cases an Activity
        is recorded instead of raising. Other ``requests`` exceptions (connection errors, timeouts)
        are not handled here and propagate to the caller.
        """
        self._data = None
        try:
            # This endpoint routinely has an expired certificate, and it's more useful
            # that we are consuming the data than that we are verifying the certificate.
            r = requests.get(
                f"https://dumps.rublacklist.net/fetch/{latest_rev}",
                timeout=180,
                verify=False,
            )  # nosec: B501
            r.raise_for_status()
            # The context manager makes sure the archive handle is released.
            with ZipFile(BytesIO(r.content)) as zip_file:
                self._data = zip_file.read("dump.xml")
            logging.debug("Downloaded %s bytes in dump %s", len(self._data), latest_rev)
        except requests.HTTPError:
            self._report_dump_failure(
                f"[{self.short_name}] 🚨 Unable to download dump {latest_rev} due to HTTP error {r.status_code}. "
                + _RETRY_NOTICE
            )
        except (BadZipFile, KeyError):
            # KeyError is what ZipFile.read() raises when "dump.xml" is not in the archive.
            self._report_dump_failure(
                f"[{self.short_name}] 🚨 Unable to extract zip file from dump {latest_rev}. There was an error "
                "related to the format of the zip file. "
                + _RETRY_NOTICE
            )

    def fetch(self) -> None:
        """
        Check the server for a new dump revision and download it if there is one.

        The last seen revision is persisted in the TerraformState row keyed ``block_roskomsvoboda``.
        The stored revision is updated *before* the download is attempted, so a dump that fails to
        download is not retried until the server publishes a newer revision. ``self._data`` is None
        afterwards unless a new dump was downloaded successfully.
        """
        self._data = None

        # Ask the server first so that a failed request leaves the database session untouched.
        # This endpoint routinely has an expired certificate, and it's more useful
        # that we are consuming the data than that we are verifying the certificate.
        response = requests.get(
            "https://dumps.rublacklist.net/fetch/latest", timeout=30, verify=False
        )  # nosec: B501
        latest_rev = response.text.strip()
        if not response.ok or not latest_rev:
            # Never record an error page (or an empty body) as the latest revision.
            logging.warning(
                "Could not determine the latest dump revision (HTTP %s, %s bytes); skipping",
                response.status_code,
                len(response.content),
            )
            return

        state: Optional[TerraformState] = TerraformState.query.filter(
            TerraformState.key == "block_roskomsvoboda"
        ).first()
        if state is None:
            state = TerraformState()
            state.key = "block_roskomsvoboda"
            db.session.add(state)
            latest_metadata = {"dump_rev": "0"}
        else:
            latest_metadata = json.loads(state.state)

        logging.debug(
            "Latest revision is %s, already got %s",
            latest_rev,
            latest_metadata["dump_rev"],
        )
        if latest_rev == latest_metadata["dump_rev"]:
            return

        state.state = json.dumps({"dump_rev": latest_rev})
        db.session.commit()
        self._fetch(latest_rev)

    def parse(self) -> None:
        """
        Extract blocked domains from the downloaded dump into ``self.patterns["roskomsvoboda"]``.

        Every non-empty ``<domain>`` element contributes one ``https://<domain>`` pattern. Does nothing
        when no new dump was downloaded. If the XML is malformed an Activity is recorded; patterns
        collected before the syntax error are kept.
        """
        if not self._data:
            logging.debug("No new data to parse")
            return

        imported = 0
        try:
            for _event, element in lxml.etree.iterparse(
                BytesIO(self._data), resolve_entities=False
            ):
                if element.tag == "domain":
                    # element.text is None for an empty <domain/> element.
                    domain = (element.text or "").strip()
                    if domain:
                        self.patterns["roskomsvoboda"].append("https://" + domain)
                        imported += 1
                # Drop the element's content once handled so the in-memory tree
                # does not retain the whole dump.
                element.clear()
        except XMLSyntaxError:
            self._report_dump_failure(
                f"[{self.short_name}] 🚨 Unable to parse XML file from dump. There was an error "
                "related to the format of the XML file within the zip file. Interestingly we were able to "
                "extract the file from the zip file fine. "
                + _RETRY_NOTICE
            )
        logging.debug("Found %s patterns", imported)