# majuna/app/terraform/block_roskomsvoboda.py

import json
import logging
from io import BytesIO
from typing import Any, Optional
from zipfile import ZipFile, BadZipFile

import lxml
import requests
from lxml.etree import XMLSyntaxError

from app.extensions import db
from app.models.activity import Activity
from app.models.tfstate import TerraformState
from app.terraform.block_mirror import BlockMirrorAutomation


class BlockRoskomsvobodaAutomation(BlockMirrorAutomation):
    """
    Automation task to import Russian blocklist from RosKomSvoboda.

    This task will import the Russian state register of prohibited sites,
    which is part of the enforcement of federal laws of the Russian Federation
    No. 139-FZ, No. 187-FZ, No. 398-FZ and a number of others that regulate
    the dissemination of information on the Internet.

    Where proxies are found to be blocked they will be rotated.
    """
    short_name = "block_roskomsvoboda"
    description = "Import Russian blocklist from RosKomSvoboda"
    frequency = 300

    # Raw bytes of ``dump.xml`` from the dump downloaded by the last call to
    # ``fetch()``, or ``None`` when there is nothing new to parse.
    _data: Any

    def _notify_failure(self, text: str) -> None:
        """
        Record a failure of this automation as an activity.

        Creates an ``Activity`` of type "automation" carrying *text*, calls its
        ``notify()`` method and commits it to the database session.

        :param text: message describing the failure
        """
        activity = Activity(activity_type="automation", text=text)
        activity.notify()
        db.session.add(activity)
        db.session.commit()

    def _fetch(self, latest_rev) -> None:
        """
        Download dump revision *latest_rev* and keep its ``dump.xml`` in ``self._data``.

        On an HTTP error status, or when the response is not a usable zip
        archive containing ``dump.xml``, a failure activity is recorded and
        ``self._data`` is left as ``None``. Other network errors (connection
        failures, timeouts) are not handled here and propagate to the caller.

        :param latest_rev: revision identifier as returned by the server
        """
        self._data = None
        try:
            # NOTE(review): TLS certificate verification is disabled here; presumably
            # deliberate for this server - confirm before changing.
            r = requests.get(f"https://dumps.rublacklist.net/fetch/{latest_rev}",
                             verify=False, timeout=180)
            r.raise_for_status()
            # The context manager ensures the archive is closed once read.
            with ZipFile(BytesIO(r.content)) as zip_file:
                self._data = zip_file.read("dump.xml")
            logging.debug("Downloaded %s bytes in dump %s", len(self._data), latest_rev)
        except requests.HTTPError:
            self._notify_failure(
                f"[{self.short_name}] 🚨 Unable to download dump {latest_rev} due to HTTP error {r.status_code}. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
        except (BadZipFile, KeyError):
            # ZipFile.read() raises KeyError when the archive has no "dump.xml"
            # member; treat that the same as a malformed archive.
            self._notify_failure(
                f"[{self.short_name}] 🚨 Unable to extract zip file from dump {latest_rev}. There was an error "
                "related to the format of the zip file. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")

    def fetch(self) -> None:
        """
        Download the latest dump if its revision differs from the stored one.

        The stored revision is updated and committed *before* the download is
        attempted, so a dump that fails to download is not retried until the
        server publishes a different revision. When there is nothing new (or
        the latest revision could not be determined) ``self._data`` is ``None``.
        """
        self._data = None
        # Ask for the latest revision before touching the database, so a failed
        # request leaves no half-initialised state row pending in the session.
        response = requests.get("https://dumps.rublacklist.net/fetch/latest",
                                verify=False, timeout=30)
        try:
            response.raise_for_status()
        except requests.HTTPError as exc:
            # Without this check the body of an error page would be stored as
            # the revision and interpolated into the download URL.
            logging.warning("Unable to determine latest dump revision: %s", exc)
            return
        latest_rev = response.text.strip()

        state: Optional[TerraformState] = TerraformState.query.filter(
            TerraformState.key == "block_roskomsvoboda").first()
        if state is None:
            state = TerraformState()
            state.key = "block_roskomsvoboda"
            db.session.add(state)
            latest_metadata = {"dump_rev": "0"}
        else:
            latest_metadata = json.loads(state.state)
        logging.debug("Latest revision is %s, already got %s", latest_rev, latest_metadata["dump_rev"])
        if latest_rev != latest_metadata["dump_rev"]:
            state.state = json.dumps({"dump_rev": latest_rev})
            db.session.commit()
            self._fetch(latest_rev)

    def parse(self) -> None:
        """
        Extract blocked domains from the downloaded dump into ``self.patterns``.

        Every non-empty ``<domain>`` element contributes one
        ``"https://<domain>"`` pattern. If the XML is malformed a failure
        activity is recorded; patterns collected before the error are kept.
        """
        if not self._data:
            logging.debug("No new data to parse")
            return
        try:
            for _event, element in lxml.etree.iterparse(BytesIO(self._data)):
                if element.tag == "domain":
                    # element.text is None for an empty element; skip those (and
                    # whitespace-only ones) rather than crash or emit a bare "https://".
                    domain = (element.text or "").strip()
                    if domain:
                        self.patterns.append("https://" + domain)
                # Release the element's content once handled so the whole dump
                # is not retained in memory as a tree.
                element.clear()
        except XMLSyntaxError:
            self._notify_failure(
                f"[{self.short_name}] 🚨 Unable to parse XML file from dump. There was an error "
                "related to the format of the XML file within the zip file. Interestingly we were able to "
                "extract the file from the zip file fine. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
        logging.debug("Found %s patterns", len(self.patterns))