majuna/app/terraform/block_roskomsvoboda.py

113 lines
5.3 KiB
Python
Raw Normal View History

import json
import logging
from io import BytesIO
from typing import Any, Optional
from zipfile import ZipFile, BadZipFile
import lxml # nosec: B410
import requests
from lxml.etree import XMLSyntaxError # nosec: B410
from app.extensions import db
from app.models.activity import Activity
from app.models.tfstate import TerraformState
from app.terraform.block_mirror import BlockMirrorAutomation
# TODO: Security considerations for lxml
#
# This module makes use of lxml for parsing XML. There are some known issues relating to
# malicious XML being crafted to exploit XML parsers such that they will exhaust available
# CPU and RAM. Here we use the event-driven parser and disable entity resolution, which
# should help to reduce the risks; however, a more in-depth review would be good in the future.
class BlockRoskomsvobodaAutomation(BlockMirrorAutomation):
    """
    Automation task to import Russian blocklist from RosKomSvoboda.

    This task will import the Russian state register of prohibited sites,
    which is part of the enforcement of federal laws of the Russian Federation
    No. 139-FZ, No. 187-FZ, No. 398-FZ and a number of others that regulate
    the dissemination of information on the Internet.

    Where proxies are found to be blocked they will be rotated.
    """
    short_name = "block_roskomsvoboda"
    description = "Import Russian blocklist from RosKomSvoboda"
    frequency = 300

    # Raw bytes of "dump.xml" from the most recently downloaded dump, or None
    # when there is nothing new to parse (no new revision, or the download failed).
    _data: Any

    def _notify_failure(self, text: str) -> None:
        """
        Record an "automation" activity carrying *text*, send its notification and persist it.

        :param text: the human-readable message stored on the activity
        """
        activity = Activity(activity_type="automation", text=text)
        activity.notify()
        db.session.add(activity)
        db.session.commit()

    def _fetch(self, latest_rev: str) -> None:
        """
        Download dump revision *latest_rev* and keep the contained ``dump.xml`` in ``self._data``.

        On an HTTP error status, or when the archive is unreadable or does not contain
        ``dump.xml``, an activity is recorded and ``self._data`` is left as ``None``.
        Connection-level errors raised by ``requests.get`` are not handled here.

        :param latest_rev: the revision identifier to request from the dump server
        """
        self._data = None
        # This endpoint routinely has an expired certificate, and it's more useful that we are consuming the
        # data than that we are verifying the certificate.
        response = requests.get(f"https://dumps.rublacklist.net/fetch/{latest_rev}",
                                timeout=180, verify=False)  # nosec: B501
        try:
            response.raise_for_status()
        except requests.HTTPError:
            self._notify_failure(
                f"[{self.short_name}] 🚨 Unable to download dump {latest_rev} due to HTTP error "
                f"{response.status_code}. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
            return
        try:
            # The context manager closes the archive; ZipFile.read raises KeyError when the
            # expected member is absent, which is reported the same way as a corrupt archive.
            with ZipFile(BytesIO(response.content)) as archive:
                self._data = archive.read("dump.xml")
        except (BadZipFile, KeyError):
            self._notify_failure(
                f"[{self.short_name}] 🚨 Unable to extract zip file from dump {latest_rev}. There was an error "
                "related to the format of the zip file. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
            return
        logging.debug("Downloaded %s bytes in dump %s", len(self._data), latest_rev)

    def fetch(self) -> None:
        """
        Check the dump server for a new revision and download it when one is available.

        The last seen revision is kept as JSON in the ``TerraformState`` row keyed
        ``block_roskomsvoboda``. The stored revision is updated before the download is
        attempted, so a dump that fails to download is not retried. When there is no
        new revision, or the revision could not be determined, ``self._data`` is ``None``.
        """
        self._data = None
        state: Optional[TerraformState] = TerraformState.query.filter(
            TerraformState.key == "block_roskomsvoboda").first()
        known_rev = "0" if state is None else json.loads(state.state)["dump_rev"]
        # This endpoint routinely has an expired certificate, and it's more useful that we are consuming the
        # data than that we are verifying the certificate.
        response = requests.get("https://dumps.rublacklist.net/fetch/latest",
                                timeout=30, verify=False)  # nosec: B501
        latest_rev = response.text.strip()
        if not response.ok or not latest_rev:
            # Never persist an error page body (or nothing at all) as the latest revision.
            logging.warning("Unable to determine latest revision (HTTP status %s)", response.status_code)
            return
        logging.debug("Latest revision is %s, already got %s", latest_rev, known_rev)
        if latest_rev == known_rev:
            return
        if state is None:
            # Only create the row once there is a valid revision to store in it.
            state = TerraformState()
            state.key = "block_roskomsvoboda"
            db.session.add(state)
        state.state = json.dumps({"dump_rev": latest_rev})
        db.session.commit()
        self._fetch(latest_rev)

    def parse(self) -> None:
        """
        Extract blocked domains from the downloaded dump into ``self.patterns["roskomsvoboda"]``.

        Each non-empty ``<domain>`` element is appended with an ``https://`` prefix. If the
        XML is malformed an activity is recorded; patterns read before the error are kept.
        """
        if not self._data:
            logging.debug("No new data to parse")
            return
        try:
            for _event, element in lxml.etree.iterparse(BytesIO(self._data),
                                                        resolve_entities=False):
                if element.tag == "domain":
                    # element.text is None for an empty <domain/> element.
                    domain = (element.text or "").strip()
                    if domain:
                        self.patterns["roskomsvoboda"].append("https://" + domain)
                # Drop the element's content once handled so the tree does not accumulate in memory.
                element.clear()
        except XMLSyntaxError:
            self._notify_failure(
                f"[{self.short_name}] 🚨 Unable to parse XML file from dump. There was an error "
                "related to the format of the XML file within the zip file. Interestingly we were able to "
                "extract the file from the zip file fine. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
        # self.patterns is keyed by source, so count this source's entries rather than the keys.
        logging.debug("Found %s patterns", len(self.patterns.get("roskomsvoboda", [])))