Reviewed copy of the patch (text before the first "diff --git" header is ignored by patch tools).
The added lines of app/terraform/block_roskomsvoboda.py were amended during review, so the
post-image blob id on that file's "index" line no longer describes them.

diff --git a/app/terraform/block_mirror.py b/app/terraform/block_mirror.py
index f86c455..58668e7 100644
--- a/app/terraform/block_mirror.py
+++ b/app/terraform/block_mirror.py
@@ -12,12 +12,14 @@ from app.terraform import BaseAutomation
 
 class BlockMirrorAutomation(BaseAutomation):
     patterns: List[str]
+    _data: Any
 
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         """
         Constructor method.
         """
         self.patterns = []
+        self._data = None
         super().__init__(*args, **kwargs)
 
     def automate(self, full: bool = False) -> Tuple[bool, str]:
diff --git a/app/terraform/block_roskomsvoboda.py b/app/terraform/block_roskomsvoboda.py
index 29b6162..d4b60c0 100644
--- a/app/terraform/block_roskomsvoboda.py
+++ b/app/terraform/block_roskomsvoboda.py
@@ -1,7 +1,16 @@
-from typing import Any
+import json
+import logging
+from io import BytesIO
+from typing import Any, Optional
+from zipfile import ZipFile, BadZipFile
 
+import lxml
 import requests
+from lxml.etree import XMLSyntaxError
 
+from app.extensions import db
+from app.models.activity import Activity
+from app.models.tfstate import TerraformState
 from app.terraform.block_mirror import BlockMirrorAutomation
 
 
@@ -22,9 +31,110 @@ class BlockRoskomsvobodaAutomation(BlockMirrorAutomation):
 
     _data: Any
 
+    def _fetch(self, latest_rev) -> None:
+        """
+        Download the dump for one revision and keep the raw ``dump.xml`` bytes in ``self._data``.
+
+        ``self._data`` stays ``None`` unless both the download and the extraction succeed. An HTTP
+        error status or an unreadable archive is recorded as an activity instead of being raised.
+
+        :param latest_rev: revision identifier used to build the download URL
+        """
+        self._data = None
+        try:
+            # NOTE(review): verify=False disables TLS certificate verification - confirm this is intended.
+            r = requests.get(f"https://dumps.rublacklist.net/fetch/{latest_rev}",
+                             verify=False, timeout=180)
+            r.raise_for_status()
+            # Close the archive deterministically once the member has been read.
+            with ZipFile(BytesIO(r.content)) as zip_file:
+                self._data = zip_file.read("dump.xml")
+            logging.debug("Downloaded %s bytes in dump %s", len(self._data), latest_rev)
+        except requests.HTTPError:
+            activity = Activity(
+                activity_type="automation",
+                text=(f"[{self.short_name}] 🚨 Unable to download dump {latest_rev} due to HTTP error {r.status_code}. "
+                      "The automation task has not been disabled and will attempt to download the next dump when the "
+                      "latest dump revision is incremented at the server."))
+            activity.notify()
+            db.session.add(activity)
+            db.session.commit()
+        except (BadZipFile, KeyError):
+            # KeyError: the archive opened fine but contains no "dump.xml" member.
+            activity = Activity(
+                activity_type="automation",
+                text=(f"[{self.short_name}] 🚨 Unable to extract zip file from dump {latest_rev}. There was an error "
+                      "related to the format of the zip file. "
+                      "The automation task has not been disabled and will attempt to download the next dump when the "
+                      "latest dump revision is incremented at the server."))
+            activity.notify()
+            db.session.add(activity)
+            db.session.commit()
+
     def fetch(self) -> None:
-        self._data = requests.get("https://reestr.rublacklist.net/api/v3/domains/",
-                                  timeout=180).json()
+        """
+        Check the server for a new dump revision and download it if it differs from the stored one.
+
+        The last seen revision is kept as JSON in the ``TerraformState`` row with key
+        ``block_roskomsvoboda``. The new revision is committed before the download is attempted, so a
+        dump that fails to download is not retried. ``self._data`` is ``None`` when there is nothing new.
+        """
+        state: Optional[TerraformState] = TerraformState.query.filter(
+            TerraformState.key == "block_roskomsvoboda").first()
+        if state is None:
+            state = TerraformState()
+            state.key = "block_roskomsvoboda"
+            latest_metadata = {"dump_rev": "0"}
+        else:
+            latest_metadata = json.loads(state.state)
+        response = requests.get("https://dumps.rublacklist.net/fetch/latest",
+                                verify=False, timeout=30)
+        if not response.ok:
+            # An error page must not be mistaken for, and persisted as, a revision identifier.
+            logging.warning("Unable to determine latest dump revision: HTTP %s", response.status_code)
+            self._data = None
+            return
+        latest_rev = response.text.strip()
+        logging.debug("Latest revision is %s, already got %s", latest_rev, latest_metadata["dump_rev"])
+        if latest_rev != latest_metadata["dump_rev"]:
+            state.state = json.dumps({"dump_rev": latest_rev})
+            # Attach the row only once it carries a value, so an aborted run never leaves an
+            # empty state pending in the session. Adding an already-persistent row is a no-op.
+            db.session.add(state)
+            db.session.commit()
+            self._fetch(latest_rev)
+        else:
+            self._data = None
 
     def parse(self) -> None:
-        self.patterns.extend(["https://" + pattern for pattern in self._data])
+        """
+        Extract every ``domain`` element from the downloaded XML dump into ``self.patterns``.
+
+        Does nothing when no new dump was downloaded. A malformed XML file is reported as an
+        activity rather than raised; patterns read before the error are kept.
+        """
+        if not self._data:
+            logging.debug("No new data to parse")
+            return
+        try:
+            for _event, element in lxml.etree.iterparse(BytesIO(self._data)):
+                if element.tag == "domain":
+                    # An empty element has text None; skip it rather than crash or add a bare scheme.
+                    domain = (element.text or "").strip()
+                    if domain:
+                        self.patterns.append("https://" + domain)
+                # Release the element's content once it has been seen so that the partial tree does
+                # not keep the whole dump in memory.
+                element.clear()
+        except XMLSyntaxError:
+            activity = Activity(
+                activity_type="automation",
+                text=(f"[{self.short_name}] 🚨 Unable to parse XML file from dump. There was an error "
+                      "related to the format of the XML file within the zip file. Interestingly we were able to "
+                      "extract the file from the zip file fine. "
+                      "The automation task has not been disabled and will attempt to download the next dump when the "
+                      "latest dump revision is incremented at the server."))
+            activity.notify()
+            db.session.add(activity)
+            db.session.commit()
+        logging.debug("Found %s patterns", len(self.patterns))