majuna/app/terraform/block_roskomsvoboda.py

import json
import logging
from io import BytesIO
from typing import Any, Optional
from zipfile import ZipFile, BadZipFile

import lxml  # nosec: B410
import requests
from lxml.etree import XMLSyntaxError  # nosec: B410

from app.extensions import db
from app.models.activity import Activity
from app.models.tfstate import TerraformState
from app.terraform.block_mirror import BlockMirrorAutomation

# TODO: Security considerations for lxml
#
# This module makes use of lxml for parsing XML. There are some known issues relating to
# malicious XML being crafted to exploit XML parsers such that they will exhaust available
# CPU and RAM. Here we use the event-driven parser and disable entity resolution, which
# should help to reduce the risks; however, a more in-depth review would be good in the future.


class BlockRoskomsvobodaAutomation(BlockMirrorAutomation):
    """
    Automation task to import Russian blocklist from RosKomSvoboda.

    This task will import the Russian state register of prohibited sites,
    which is part of the enforcement of federal laws of the Russian Federation
    No. 139-FZ, No. 187-FZ, No. 398-FZ and a number of others that regulate
    the dissemination of information on the Internet.

    Where proxies are found to be blocked they will be rotated.
    """
    short_name = "block_roskomsvoboda"
    description = "Import Russian blocklist from RosKomSvoboda"
    # NOTE(review): the commit history describes this as "every 300 minutes"; the unit is
    # interpreted by the base class, which is not visible here.
    frequency = 300

    # Raw bytes of the extracted dump.xml, or None when there is nothing new to parse.
    _data: Any

    def _report_dump_failure(self, text: str) -> None:
        """Record an "automation" activity with the given text, send its notification and commit it."""
        activity = Activity(activity_type="automation", text=text)
        activity.notify()
        db.session.add(activity)
        db.session.commit()

    def _fetch(self, latest_rev: str) -> None:
        """
        Download dump revision ``latest_rev`` and store the contained ``dump.xml`` in ``self._data``.

        On an HTTP error status, or when the archive is malformed or does not contain
        ``dump.xml``, an activity is recorded and ``self._data`` is left as ``None``.
        Network-level errors raised by ``requests.get`` (timeouts, connection failures)
        are not handled here and propagate to the caller.
        """
        self._data = None
        # This endpoint routinely has an expired certificate, and it's more useful that we are consuming the
        # data than that we are verifying the certificate.
        response = requests.get(f"https://dumps.rublacklist.net/fetch/{latest_rev}", timeout=180, verify=False)  # nosec: B501
        try:
            response.raise_for_status()
        except requests.HTTPError:
            self._report_dump_failure(
                f"[{self.short_name}] 🚨 Unable to download dump {latest_rev} due to HTTP error {response.status_code}. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
            return
        try:
            # The context manager closes the archive even if reading the member fails.
            with ZipFile(BytesIO(response.content)) as zip_file:
                self._data = zip_file.read("dump.xml")
        except (BadZipFile, KeyError):
            # ZipFile.read raises KeyError when "dump.xml" is not in the archive; that is
            # reported the same way as any other problem with the archive.
            self._data = None
            self._report_dump_failure(
                f"[{self.short_name}] 🚨 Unable to extract zip file from dump {latest_rev}. There was an error "
                "related to the format of the zip file. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
            return
        logging.debug("Downloaded %s bytes in dump %s", len(self._data), latest_rev)

    def fetch(self) -> None:
        """
        Check the server for a new dump revision and download it when it differs from the stored one.

        The last seen revision is kept as JSON in the ``TerraformState`` row with key
        ``block_roskomsvoboda``. The new revision is committed *before* the download is
        attempted, so a dump that fails to download is not retried until the server
        publishes another revision. ``self._data`` is ``None`` whenever no new dump is
        available for :meth:`parse`.
        """
        state: Optional[TerraformState] = TerraformState.query.filter(
            TerraformState.key == "block_roskomsvoboda").first()
        if state is None:
            latest_metadata = {"dump_rev": "0"}
        else:
            latest_metadata = json.loads(state.state)
        # This endpoint routinely has an expired certificate, and it's more useful that we are consuming the
        # data than that we are verifying the certificate.
        response = requests.get("https://dumps.rublacklist.net/fetch/latest", timeout=30, verify=False)  # nosec: B501
        try:
            response.raise_for_status()
        except requests.HTTPError:
            # Without this check the body of an error page would be stored as the
            # "revision" and used to build the next download URL.
            logging.warning("Unable to determine latest revision: HTTP error %s", response.status_code)
            self._data = None
            return
        latest_rev = response.text.strip()
        if not latest_rev:
            logging.warning("Unable to determine latest revision: empty response")
            self._data = None
            return
        logging.debug("Latest revision is %s, already got %s", latest_rev, latest_metadata["dump_rev"])
        if latest_rev == latest_metadata["dump_rev"]:
            self._data = None
            return
        if state is None:
            # Only create the row once there is a revision to store in it, so a row
            # without any state is never left pending in the session.
            state = TerraformState()
            state.key = "block_roskomsvoboda"
            db.session.add(state)
        state.state = json.dumps({"dump_rev": latest_rev})
        db.session.commit()
        self._fetch(latest_rev)

    def parse(self) -> None:
        """
        Extract blocked domains from the downloaded dump into ``self.patterns["roskomsvoboda"]``.

        Every non-empty ``<domain>`` element contributes an ``https://``-prefixed entry.
        If the XML is malformed an activity is recorded; entries collected before the
        error are kept.
        """
        if not self._data:
            logging.debug("No new data to parse")
            return
        found = 0
        try:
            for _event, element in lxml.etree.iterparse(BytesIO(self._data),
                                                        resolve_entities=False):
                if element.tag == "domain":
                    # element.text is None for an empty <domain/> element.
                    domain = (element.text or "").strip()
                    if domain:
                        self.patterns["roskomsvoboda"].append("https://" + domain)
                        found += 1
                # Release the element's content once handled so memory use stays
                # bounded while streaming through a large dump.
                element.clear()
        except XMLSyntaxError:
            self._report_dump_failure(
                f"[{self.short_name}] 🚨 Unable to parse XML file from dump. There was an error "
                "related to the format of the XML file within the zip file. Interestingly we were able to "
                "extract the file from the zip file fine. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
        logging.debug("Found %s patterns", found)
feat(block): use roskomsvoboda private api 2023-04-26 16:21:12 +01:00			`import json`
			`import logging`
			`from io import BytesIO`
			`from typing import Any, Optional`
			`from zipfile import ZipFile, BadZipFile`
block: import blocklist from roskomsvoboda 2022-05-09 14:11:05 +01:00
fix(block): security updates for rks import 2023-05-03 14:47:25 +01:00			`import lxml # nosec: B410`
block: import blocklist from roskomsvoboda 2022-05-09 14:11:05 +01:00			`import requests`
fix(block): security updates for rks import 2023-05-03 14:47:25 +01:00			`from lxml.etree import XMLSyntaxError # nosec: B410`
block: import blocklist from roskomsvoboda 2022-05-09 14:11:05 +01:00
feat(block): use roskomsvoboda private api 2023-04-26 16:21:12 +01:00			`from app.extensions import db`
			`from app.models.activity import Activity`
			`from app.models.tfstate import TerraformState`
block: extend BlockMirror not Base for RKS 2022-06-18 13:01:18 +01:00			`from app.terraform.block_mirror import BlockMirrorAutomation`
block: import blocklist from roskomsvoboda 2022-05-09 14:11:05 +01:00
fix(block): security updates for rks import 2023-05-03 14:47:25 +01:00			`# TODO: Security considerations for lxml`
			`#`
			`# This module makes use of lxml for parsing XML. There are some known issues relating to`
			`# malicious XML being crafted to exploit XML parses such that they will exhaust available`
			`# CPU and RAM. Here we use the event-driven parser and disable entity resolution so this`
			`# should help to reduce the risks however a more in-depth review would be good in the future.`

block: import blocklist from roskomsvoboda 2022-05-09 14:11:05 +01:00
block: extend BlockMirror not Base for RKS 2022-06-18 13:01:18 +01:00			`class BlockRoskomsvobodaAutomation(BlockMirrorAutomation):`
lint: tidying up code in block tasks 2022-06-17 12:42:42 +01:00			`"""`
			`Automation task to import Russian blocklist from RosKomSvoboda.`

			`This task will import the Russian state register of prohibited sites,`
			`which is part of the enforcement of federal laws of the Russian Federation`
			`No. 139-FZ, No. 187-FZ, No. 398-FZ and a number of others that regulate`
			`the dissemination of information on the Internet.`

			`Where proxies are found to be blocked they will be rotated.`
			`"""`
block: import blocklist from roskomsvoboda 2022-05-09 14:11:05 +01:00			`short_name = "block_roskomsvoboda"`
			`description = "Import Russian blocklist from RosKomSvoboda"`
block/roskomsvoboda: switch to updating every 300 minutes data is now updated every 3 hours 2022-11-13 13:21:40 +00:00			`frequency = 300`
block: import blocklist from roskomsvoboda 2022-05-09 14:11:05 +01:00
block: try to unify the mirror block modules 2022-06-18 12:36:54 +01:00			`_data: Any`

fix(block): security updates for rks import 2023-05-03 14:47:25 +01:00			`def _fetch(self, latest_rev: str) -> None:`
feat(block): use roskomsvoboda private api 2023-04-26 16:21:12 +01:00			`self._data = None`
			`try:`
Explain why we do not verify roskomsvoboda certificate 2024-02-19 12:17:49 +00:00			`# This endpoint routinely has an expired certificate, and it's more useful that we are consuming the`
			`# data than that we are verifying the certificate.`
			`r = requests.get(f"https://dumps.rublacklist.net/fetch/{latest_rev}", timeout=180, verify=False) # nosec: B501`
feat(block): use roskomsvoboda private api 2023-04-26 16:21:12 +01:00			`r.raise_for_status()`
			`zip_file = ZipFile(BytesIO(r.content))`
			`self._data = zip_file.read("dump.xml")`
			`logging.debug("Downloaded %s bytes in dump %s", len(self._data), latest_rev)`
			`except requests.HTTPError:`
			`activity = Activity(`
			`activity_type="automation",`
			`text=(f"[{self.short_name}] 🚨 Unable to download dump {latest_rev} due to HTTP error {r.status_code}. "`
			`"The automation task has not been disabled and will attempt to download the next dump when the "`
			`"latest dump revision is incremented at the server."))`
			`activity.notify()`
			`db.session.add(activity)`
			`db.session.commit()`
			`except BadZipFile:`
			`activity = Activity(`
			`activity_type="automation",`
			`text=(f"[{self.short_name}] 🚨 Unable to extract zip file from dump {latest_rev}. There was an error "`
			`"related to the format of the zip file. "`
			`"The automation task has not been disabled and will attempt to download the next dump when the "`
			`"latest dump revision is incremented at the server."))`
			`activity.notify()`
			`db.session.add(activity)`
			`db.session.commit()`

block: add typing hints 2022-06-18 13:17:36 +01:00			`def fetch(self) -> None:`
feat(block): use roskomsvoboda private api 2023-04-26 16:21:12 +01:00			`state: Optional[TerraformState] = TerraformState.query.filter(`
			`TerraformState.key == "block_roskomsvoboda").first()`
			`if state is None:`
			`state = TerraformState()`
			`state.key = "block_roskomsvoboda"`
			`db.session.add(state)`
			`latest_metadata = {"dump_rev": "0"}`
			`else:`
			`latest_metadata = json.loads(state.state)`
Explain why we do not verify roskomsvoboda certificate 2024-02-19 12:17:49 +00:00			`# This endpoint routinely has an expired certificate, and it's more useful that we are consuming the`
			`# data than that we are verifying the certificate.`
			`latest_rev = requests.get("https://dumps.rublacklist.net/fetch/latest", timeout=30, verify=False).text.strip() # nosec: B501`
feat(block): use roskomsvoboda private api 2023-04-26 16:21:12 +01:00			`logging.debug("Latest revision is %s, already got %s", latest_rev, latest_metadata["dump_rev"])`
			`if latest_rev != latest_metadata["dump_rev"]:`
			`state.state = json.dumps({"dump_rev": latest_rev})`
			`db.session.commit()`
			`self._fetch(latest_rev)`
			`else:`
			`self._data = None`
block: try to unify the mirror block modules 2022-06-18 12:36:54 +01:00
block: add typing hints 2022-06-18 13:17:36 +01:00			`def parse(self) -> None:`
feat(block): use roskomsvoboda private api 2023-04-26 16:21:12 +01:00			`if not self._data:`
			`logging.debug("No new data to parse")`
			`return`
			`try:`
fix(block): security updates for rks import 2023-05-03 14:47:25 +01:00			`for _event, element in lxml.etree.iterparse(BytesIO(self._data),`
			`resolve_entities=False):`
feat(block): use roskomsvoboda private api 2023-04-26 16:21:12 +01:00			`if element.tag == "domain":`
feat: geo risk scores 2023-10-29 15:45:10 +00:00			`self.patterns["roskomsvoboda"].append("https://" + element.text.strip())`
feat(block): use roskomsvoboda private api 2023-04-26 16:21:12 +01:00			`except XMLSyntaxError:`
			`activity = Activity(`
			`activity_type="automation",`
			`text=(f"[{self.short_name}] 🚨 Unable to parse XML file from dump. There was an error "`
			`"related to the format of the XML file within the zip file. Interestingly we were able to "`
			`"extract the file from the zip file fine. "`
			`"The automation task has not been disabled and will attempt to download the next dump when the "`
			`"latest dump revision is incremented at the server."))`
			`activity.notify()`
			`db.session.add(activity)`
			`db.session.commit()`
			`logging.debug("Found %s patterns", len(self.patterns))`