2023-04-26 16:21:12 +01:00
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
from io import BytesIO
|
|
|
|
from typing import Any, Optional
|
2024-12-06 18:02:59 +00:00
|
|
|
from zipfile import BadZipFile, ZipFile
|
2022-05-09 14:11:05 +01:00
|
|
|
|
2023-05-03 14:47:25 +01:00
|
|
|
import lxml # nosec: B410
|
2022-05-09 14:11:05 +01:00
|
|
|
import requests
|
2023-05-03 14:47:25 +01:00
|
|
|
from lxml.etree import XMLSyntaxError # nosec: B410
|
2022-05-09 14:11:05 +01:00
|
|
|
|
2023-04-26 16:21:12 +01:00
|
|
|
from app.extensions import db
|
|
|
|
from app.models.activity import Activity
|
|
|
|
from app.models.tfstate import TerraformState
|
2022-06-18 13:01:18 +01:00
|
|
|
from app.terraform.block_mirror import BlockMirrorAutomation
|
2022-05-09 14:11:05 +01:00
|
|
|
|
2023-05-03 14:47:25 +01:00
|
|
|
# TODO: Security considerations for lxml
|
|
|
|
#
|
|
|
|
# This module makes use of lxml for parsing XML. There are some known issues relating to
|
|
|
|
# malicious XML being crafted to exploit XML parsers such that they will exhaust available
|
|
|
|
# CPU and RAM. Here we use the event-driven parser and disable entity resolution so this
|
|
|
|
# should help to reduce the risks; however, a more in-depth review would be good in the future.
|
|
|
|
|
2022-05-09 14:11:05 +01:00
|
|
|
|
2022-06-18 13:01:18 +01:00
|
|
|
class BlockRoskomsvobodaAutomation(BlockMirrorAutomation):
    """
    Automation task to import Russian blocklist from RosKomSvoboda.

    This task will import the Russian state register of prohibited sites,
    which is part of the enforcement of federal laws of the Russian Federation
    No. 139-FZ, No. 187-FZ, No. 398-FZ and a number of others that regulate
    the dissemination of information on the Internet.

    Where proxies are found to be blocked they will be rotated.
    """

    short_name = "block_roskomsvoboda"
    description = "Import Russian blocklist from RosKomSvoboda"
    frequency = 300

    # Raw bytes of "dump.xml" from the most recently downloaded dump, or None
    # when there is nothing new to parse (no new revision, or the download or
    # extraction failed).
    _data: Optional[bytes]

    def _log_failure_activity(self, text: str) -> None:
        """Create an automation activity with *text*, send its notification and persist it."""
        activity = Activity(activity_type="automation", text=text)
        activity.notify()
        db.session.add(activity)
        db.session.commit()

    def _fetch(self, latest_rev: str) -> None:
        """
        Download dump revision *latest_rev* and keep its "dump.xml" in ``self._data``.

        ``self._data`` is reset to None first, so it stays None when the download
        or the extraction fails. HTTP errors, malformed zip files and archives
        without a "dump.xml" member are reported as an automation activity rather
        than raised.

        :param latest_rev: revision identifier appended to the dump download URL
        """
        self._data = None
        try:
            # This endpoint routinely has an expired certificate, and it's more useful that we are consuming the
            # data than that we are verifying the certificate.
            r = requests.get(f"https://dumps.rublacklist.net/fetch/{latest_rev}", timeout=180, verify=False)  # nosec: B501
            r.raise_for_status()
            # The context manager makes sure the archive is closed once the member has been read.
            with ZipFile(BytesIO(r.content)) as zip_file:
                self._data = zip_file.read("dump.xml")
            logging.debug("Downloaded %s bytes in dump %s", len(self._data), latest_rev)
        except requests.HTTPError:
            # Only raise_for_status() raises HTTPError here, so ``r`` is always bound at this point.
            self._log_failure_activity(
                f"[{self.short_name}] 🚨 Unable to download dump {latest_rev} due to HTTP error {r.status_code}. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
        except BadZipFile:
            self._log_failure_activity(
                f"[{self.short_name}] 🚨 Unable to extract zip file from dump {latest_rev}. There was an error "
                "related to the format of the zip file. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
        except KeyError:
            # ZipFile.read() raises KeyError when the archive has no member with the requested name.
            self._log_failure_activity(
                f"[{self.short_name}] 🚨 Unable to extract zip file from dump {latest_rev}. The zip file did not "
                "contain the expected dump.xml file. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")

    def fetch(self) -> None:
        """
        Check the server for a new dump revision and download it if there is one.

        The last seen revision is kept as JSON in the ``TerraformState`` row with
        key "block_roskomsvoboda". When the server reports a different revision the
        stored revision is updated and committed *before* the download is attempted,
        so a dump that fails to download is not retried. ``self._data`` is left as
        None when there is nothing new to parse.
        """
        # This endpoint routinely has an expired certificate, and it's more useful that we are consuming the
        # data than that we are verifying the certificate.
        response = requests.get("https://dumps.rublacklist.net/fetch/latest", timeout=30, verify=False)  # nosec: B501
        if not response.ok:
            # Never treat the body of an error response as a revision: it would be
            # stored in the state and then used to build the download URL.
            logging.warning("Unable to determine latest dump revision: HTTP %s", response.status_code)
            self._data = None
            return
        latest_rev = response.text.strip()

        state: Optional[TerraformState] = TerraformState.query.filter(
            TerraformState.key == "block_roskomsvoboda").first()
        if state is None:
            state = TerraformState()
            state.key = "block_roskomsvoboda"
            db.session.add(state)
            latest_metadata = {"dump_rev": "0"}
        else:
            latest_metadata = json.loads(state.state)

        logging.debug("Latest revision is %s, already got %s", latest_rev, latest_metadata["dump_rev"])
        if latest_rev != latest_metadata["dump_rev"]:
            state.state = json.dumps({"dump_rev": latest_rev})
            db.session.commit()
            self._fetch(latest_rev)
        else:
            self._data = None

    def parse(self) -> None:
        """
        Extract blocked domains from the downloaded dump into ``self.patterns``.

        Every non-empty ``<domain>`` element is appended to
        ``self.patterns["roskomsvoboda"]`` as an ``https://`` prefixed string.
        Does nothing when no new dump was downloaded. An XML syntax error is
        reported as an automation activity; patterns collected before the error
        are kept.
        """
        if not self._data:
            logging.debug("No new data to parse")
            return
        found = 0
        try:
            for _event, element in lxml.etree.iterparse(BytesIO(self._data),
                                                        resolve_entities=False):
                if element.tag == "domain":
                    # element.text is None for an empty element such as <domain/>.
                    domain = (element.text or "").strip()
                    if domain:
                        self.patterns["roskomsvoboda"].append("https://" + domain)
                        found += 1
                # Drop the element's content once it has been handled so the
                # tree built by iterparse does not retain the whole dump.
                element.clear()
        except XMLSyntaxError:
            self._log_failure_activity(
                f"[{self.short_name}] 🚨 Unable to parse XML file from dump. There was an error "
                "related to the format of the XML file within the zip file. Interestingly we were able to "
                "extract the file from the zip file fine. "
                "The automation task has not been disabled and will attempt to download the next dump when the "
                "latest dump revision is incremented at the server.")
        # Report the patterns found in this dump, not the number of keys in self.patterns.
        logging.debug("Found %s patterns", found)
|