feat(block): use roskomsvoboda private api

This commit is contained in:
Iain Learmonth 2023-04-26 16:21:12 +01:00
parent 19681d1eca
commit fb1341365f
2 changed files with 78 additions and 4 deletions

View file

@ -1,7 +1,16 @@
from typing import Any
import json
import logging
from io import BytesIO
from typing import Any, Optional
from zipfile import ZipFile, BadZipFile
import lxml
import requests
from lxml.etree import XMLSyntaxError
from app.extensions import db
from app.models.activity import Activity
from app.models.tfstate import TerraformState
from app.terraform.block_mirror import BlockMirrorAutomation
@ -22,9 +31,72 @@ class BlockRoskomsvobodaAutomation(BlockMirrorAutomation):
_data: Any
def _fetch(self, latest_rev) -> None:
    """Download dump ``latest_rev`` and keep its ``dump.xml`` payload in ``self._data``.

    ``self._data`` is reset to ``None`` first and stays ``None`` when the
    download or the extraction fails. For an HTTP error status, a malformed zip
    file, or a zip file without a ``dump.xml`` member, an "automation" activity
    is created, notified and committed instead of raising. Any other exception
    is not handled here and propagates to the caller.

    :param latest_rev: revision identifier appended to the dump fetch URL.
    """
    self._data = None
    try:
        # NOTE(review): verify=False turns off TLS certificate verification for this
        # request — confirm this is intentional for this host.
        r = requests.get(f"https://dumps.rublacklist.net/fetch/{latest_rev}",
                         verify=False, timeout=180)
        r.raise_for_status()
        # The context manager closes the archive as soon as the member has been read.
        with ZipFile(BytesIO(r.content)) as zip_file:
            self._data = zip_file.read("dump.xml")
        logging.debug("Downloaded %s bytes in dump %s", len(self._data), latest_rev)
    except requests.HTTPError:
        activity = Activity(
            activity_type="automation",
            text=(f"[{self.short_name}] 🚨 Unable to download dump {latest_rev} due to HTTP error {r.status_code}. "
                  "The automation task has not been disabled and will attempt to download the next dump when the "
                  "latest dump revision is incremented at the server."))
        activity.notify()
        db.session.add(activity)
        db.session.commit()
    except (BadZipFile, KeyError):
        # KeyError is what ZipFile.read() raises when the archive has no "dump.xml"
        # member; report it the same way as a malformed archive rather than crashing.
        activity = Activity(
            activity_type="automation",
            text=(f"[{self.short_name}] 🚨 Unable to extract zip file from dump {latest_rev}. There was an error "
                  "related to the format of the zip file. "
                  "The automation task has not been disabled and will attempt to download the next dump when the "
                  "latest dump revision is incremented at the server."))
        activity.notify()
        db.session.add(activity)
        db.session.commit()
def fetch(self) -> None:
    """Check the server for a new dump revision and download it if it is new.

    The last seen revision is kept as JSON (``{"dump_rev": ...}``) in the
    ``TerraformState`` row with key ``"block_roskomsvoboda"``; the row is
    created with revision ``"0"`` when it does not exist yet. When the server
    reports a different revision, the stored revision is updated and committed
    and the dump is downloaded via ``_fetch``. Otherwise ``self._data`` is set
    to ``None``.
    """
    state: Optional[TerraformState] = TerraformState.query.filter(
        TerraformState.key == "block_roskomsvoboda").first()
    if state is None:
        state = TerraformState()
        state.key = "block_roskomsvoboda"
        db.session.add(state)
        latest_metadata = {"dump_rev": "0"}
    else:
        latest_metadata = json.loads(state.state)
    # NOTE(review): verify=False turns off TLS certificate verification for this
    # request — confirm this is intentional for this host.
    latest_rev = requests.get("https://dumps.rublacklist.net/fetch/latest",
                              verify=False, timeout=30).text.strip()
    logging.debug("Latest revision is %s, already got %s", latest_rev, latest_metadata["dump_rev"])
    if latest_rev != latest_metadata["dump_rev"]:
        # The new revision is persisted before the download is attempted, so a
        # failed download is not retried until the server revision changes again.
        state.state = json.dumps({"dump_rev": latest_rev})
        db.session.commit()
        self._fetch(latest_rev)
    else:
        self._data = None
def parse(self) -> None:
    """Append an ``https://`` pattern for every ``domain`` element of the downloaded dump.

    Does nothing (apart from a debug log line) when ``self._data`` is empty or
    ``None``. If the XML cannot be parsed, an "automation" activity is created,
    notified and committed; patterns appended before the syntax error was hit
    are kept.
    """
    if not self._data:
        logging.debug("No new data to parse")
        return
    try:
        # The dump is downloaded from a remote server, so do not expand XML entities.
        for _event, element in lxml.etree.iterparse(BytesIO(self._data), resolve_entities=False):
            if element.tag != "domain":
                continue
            # element.text is None for an empty <domain/>; skip those instead of crashing.
            domain = (element.text or "").strip()
            if domain:
                self.patterns.append("https://" + domain)
    except XMLSyntaxError:
        activity = Activity(
            activity_type="automation",
            text=(f"[{self.short_name}] 🚨 Unable to parse XML file from dump. There was an error "
                  "related to the format of the XML file within the zip file. Interestingly we were able to "
                  "extract the file from the zip file fine. "
                  "The automation task has not been disabled and will attempt to download the next dump when the "
                  "latest dump revision is incremented at the server."))
        activity.notify()
        db.session.add(activity)
        db.session.commit()
    logging.debug("Found %s patterns", len(self.patterns))