feat: draw the rest of the owl

This commit is contained in:
Iain Learmonth 2026-03-26 10:58:03 +00:00
parent e21b725192
commit 2ba848467f
28 changed files with 1538 additions and 448 deletions

View file

@ -1,19 +1,24 @@
import base64
import copy
import datetime
import logging
import mimetypes
from typing import Any
from urllib.parse import urlparse, urlunparse, urljoin
import minify_html
import requests
from babel.dates import format_date
from babel.support import Translations
from bs4 import BeautifulSoup
from jinja2 import Environment, PackageLoader, select_autoescape
from src.config import settings
from src.database import get_db_session
from src.mirrors.service import resolve_mirror
from src.pangea.client import pangea_expanded_image_url
from src.snapshots.config import SnapshotsConfig, config_for_url
from src.snapshots.schemas import SnapshotContext
from src.snapshots.service import resolve_snapshot
class SnapshotParseError(RuntimeError):
@ -74,7 +79,7 @@ def fetch_url(base: str, url: str) -> str | None:
return None
class Snapshot:
class SnapshotCamera:
config: SnapshotsConfig | None = None
context: SnapshotContext | None = None
raw: bytes | None = None
@ -158,9 +163,29 @@ class Snapshot:
element.decompose()
for image in body.select("img"):
image.attrs = {
"src": fetch_url(self.url, image["src"]),
"alt": image["alt"],
"src": fetch_url(
pangea_expanded_image_url(self.url),
image.get("src", image.get("data-src", "")),
),
"alt": image.get("alt", ""),
}
with get_db_session() as db:
for hyperlink in body.select("a"):
absolute_url = urljoin(self.url, hyperlink.get("href"))
existing_snapshot = resolve_snapshot(db, absolute_url)
if existing_snapshot:
hyperlink.attrs.update(
{"href": existing_snapshot, "class": "snap-link--snapshot"}
)
continue
mirror_url = resolve_mirror(db, absolute_url)
if mirror_url:
hyperlink.attrs.update(
{"href": mirror_url, "class": "snap-link--mirror"}
)
continue
hyperlink.attrs.update({"href": absolute_url})
return str(body)
def preprocess(self) -> None:
@ -173,16 +198,15 @@ class Snapshot:
element.attrs.pop("style")
def favicon(self):
icon = fetch_url(
self.url, self.get_attribute_value('link[rel="icon"]', "href", optional=True)
)
if icon:
favicon_src = self.get_attribute_value('link[rel="icon"]', "href", optional=True)
if favicon_src:
icon = fetch_url(self.url, favicon_src)
return icon
parsed = urlparse(self.url)
icon_url = urlunparse((parsed.scheme, parsed.netloc, "/favicon.ico", "", "", ""))
return fetch_url(self.url, icon_url)
def published_time(self, locale: str = "en") -> str:
def published_time(self, locale) -> str:
if self.config.article_published_selector:
if published := self.get_element_content(
self.config.article_published_selector, optional=True
@ -194,12 +218,28 @@ class Snapshot:
return format_date(ts, locale=locale)
def parse(self) -> None:
if not self.config:
self.config = config_for_url(self.url)
if not self.config:
return
self.soup = BeautifulSoup(self.raw, "lxml")
self.preprocess()
article_image_source = self.get_attribute_value(
self.config.article_image_selector, "src"
)
if self.config.article_image_selector:
article_image_source = self.get_attribute_value(
self.config.article_image_selector, "src"
)
article_image_source = pangea_expanded_image_url(article_image_source)
else:
article_image_source = None
page_language = self.get_attribute_value(["html", "body"], "lang", optional=True)
site_url = urlunparse(urlparse(self.url)._replace(path="/"))
with get_db_session() as db:
article_mirror_url = resolve_mirror(db, self.url)
site_mirror_url = (
urlunparse(urlparse(article_mirror_url)._replace(path="/"))
if article_mirror_url
else None
)
self.context = SnapshotContext(
article_author=self.get_element_content(
self.config.article_author_selector, optional=True
@ -208,7 +248,9 @@ class Snapshot:
article_description=self.get_attribute_value(
'meta[name="description"]', "content", optional=True
),
article_image=fetch_url(self.url, article_image_source),
article_image=fetch_url(self.url, article_image_source)
if article_image_source
else None,
article_image_caption=self.get_element_content(
self.config.article_image_caption_selector, optional=True
),
@ -216,20 +258,25 @@ class Snapshot:
article_published=self.published_time(page_language),
article_title=self.get_element_content(self.config.article_title_selector),
article_url=self.url,
article_mirror_url=article_mirror_url,
matomo_host=settings.MATOMO_HOST,
matomo_site_id=settings.MATOMO_SITE_ID,
page_direction=self.get_attribute_value(["html", "body"], "dir", optional=True),
page_language=page_language,
site_favicon=self.favicon(),
site_logo=fetch_file(self.config.site_logo),
site_title=self.config.site_title,
site_url=site_url,
site_mirror_url=site_mirror_url,
)
def get_context(self) -> dict[str, Any]:
logging.info("Get content")
self.get_content()
logging.info("Parse")
self.parse()
logging.info("Dump")
return self.context.model_dump()
def get_context(self) -> dict[str, Any] | None:
self.config = config_for_url(self.url)
if self.config:
self.get_content()
self.parse()
return self.context.model_dump()
return None
def render(self) -> str:
context = self.get_context()
@ -246,4 +293,6 @@ class Snapshot:
translations = Translations.load("i18n", [context["page_language"], "en"])
jinja_env.install_gettext_translations(translations)
template = jinja_env.get_template("article-template.html.j2")
return template.render(**context)
return minify_html.minify(
template.render(**context), minify_js=True, minify_css=True
)

View file

@ -1,9 +1,54 @@
from datetime import datetime
from enum import Enum
from sqlalchemy.orm import Mapped
from src.models import CustomBase, IdMixin
from src.models import (
CustomBase,
IdMixin,
DeletedTimestampMixin,
TimestampMixin,
)
from src.google.config import settings as google_settings
from src.utils import hashids
class Snapshot(CustomBase, IdMixin):
class SnapshotProvider(Enum):
    """Provider on which a snapshot is hosted.

    Only Google is currently enabled; the remaining candidates are kept below
    as commented-out placeholders.
    """

    GOOGLE = "google"
    # TODO: when adding make sure to update alembic migration with
    #   op.execute("ALTER TYPE snapshotprovider ADD VALUE 'aws'")
    # AWS = "aws"
    # OVH = "ovh"
    # ORACLE = "oracle"
# class SnapshotConfiguration(CustomBase, IdMixin, TimestampMixin, DeletedTimestampMixin, DescriptionMixin):
# __tablename__ = "snapshot_template"
#
# domain: Mapped[str]
# path: Mapped[str]
# configuration: Mapped[dict[str, Any]]
class SnapshotState(Enum):
    """State values stored in ``Snapshot.snapshot_state``."""

    PENDING = "pending"
    FAILED = "failed"
    UPDATING = "updating"
    FROZEN = "frozen"
    EXPIRED = "expired"
class Snapshot(CustomBase, IdMixin, TimestampMixin, DeletedTimestampMixin):
    """ORM model mapped to the ``snapshot`` table."""

    __tablename__ = "snapshot"

    # URL of the article the snapshot was taken from.
    url: Mapped[str]
    # Pool number; lookups elsewhere in this change filter on pool 0.
    pool: Mapped[int]
    snapshot_state: Mapped[SnapshotState]
    provider: Mapped[SnapshotProvider]
    # Set when the rendered snapshot has been uploaded; None until then.
    snapshot_published_at: Mapped[datetime | None]

    @property
    def link(self) -> str:
        """Return the URL at which the rendered snapshot is served.

        For the Google provider this is the object
        ``<hashid of the row id>.html`` in the configured storage bucket.
        """
        if self.provider != SnapshotProvider.GOOGLE:
            # Impossible in practice: every enum option is handled above.
            return "unknown-provider"
        object_name = f"{hashids.encode(self.id)}.html"
        return f"https://storage.googleapis.com/{google_settings.BUCKET_NAME}/{object_name}"

View file

@ -2,48 +2,56 @@ from fastapi import APIRouter, HTTPException, BackgroundTasks
from starlette import status
from starlette.responses import HTMLResponse
from src.database import DbSession
from src.security import ApiKey
from src.snapshots.config import config_for_url
from src.snapshots.models import Snapshot, SnapshotState, SnapshotProvider
from src.config import settings
from src.google.config import settings as google_settings
from src.snapshots.client import Snapshot
from src.snapshots.client import SnapshotCamera
from src.snapshots.schemas import SnapshotContext
from src.snapshots.tasks import upload_snapshot
from src.snapshots.tasks import generate_snapshot
router = APIRouter()
@router.get(
    "/api/v1/snap-context",
    summary="Generate the context used by the snapshot template for debugging purposes.",
    response_model=SnapshotContext,
)
def context(auth: ApiKey, url: str = "https://www.bbc.com/russian/articles/ckgeey4dqgxo"):
    """Return the template context that would be used to render ``url``.

    Args:
        auth: API key dependency; the endpoint is served when this is truthy
            or when the deployment runs in debug mode.
        url: Address of the article to build the context for.

    Returns:
        The context dictionary produced by ``SnapshotCamera.get_context()``.

    Raises:
        HTTPException: 404 when the caller is not permitted, or when no
            context could be produced for ``url``.
    """
    if not (settings.ENVIRONMENT.is_debug or auth):
        raise HTTPException(status.HTTP_404_NOT_FOUND)

    snapshot_context = SnapshotCamera(url).get_context()
    if snapshot_context is None:
        # get_context() returns None when no snapshot configuration matches
        # the URL. Returning None here would fail validation against
        # response_model=SnapshotContext, so report it as "not found" instead.
        raise HTTPException(status.HTTP_404_NOT_FOUND)
    return snapshot_context
@router.get(
    "/api/v1/snap-preview",
    summary="Generate a rendered snapshot template for debugging purposes.",
    response_class=HTMLResponse,
)
def parse(auth: ApiKey, url: str = "https://www.bbc.com/russian/articles/ckgeey4dqgxo"):
    """Render the snapshot template for ``url`` and return the HTML.

    The endpoint is served when the deployment runs in debug mode or when
    ``auth`` is truthy; otherwise it answers with a 404.
    """
    permitted = settings.ENVIRONMENT.is_debug or auth
    if not permitted:
        raise HTTPException(status.HTTP_404_NOT_FOUND)
    camera = SnapshotCamera(url)
    return camera.render()
@router.get(
    "/api/v1/snap",
    summary="Generate a rendered snapshot template and upload to Google Cloud Storage.",
)
def snap(
    background_tasks: BackgroundTasks,
    db: DbSession,
    auth: ApiKey,
    url: str = "https://www.bbc.com/russian/articles/ckgeey4dqgxo",
):
    """Return the snapshot link for ``url``, scheduling generation if needed.

    Looks up the pool-0 snapshot row for ``url``. When none exists and a
    snapshot configuration matches the URL, a new PENDING row for the Google
    provider is committed and ``generate_snapshot`` is queued as a background
    task.

    Args:
        background_tasks: FastAPI background task queue used to run the
            generation after the response has been sent.
        db: Database session.
        auth: API key dependency (not read in the body; presumably enforces
            authentication at the dependency level - confirm).
        url: Address of the article to snapshot.

    Returns:
        ``{"url": <snapshot link>}`` for the existing or newly created row.

    Raises:
        HTTPException: 403 when no snapshot exists and no configuration
            matches ``url``.
    """
    s = db.query(Snapshot).filter(Snapshot.url == url, Snapshot.pool == 0).first()
    if not s and config_for_url(url):
        s = Snapshot(
            url=url,
            pool=0,
            snapshot_state=SnapshotState.PENDING,
            provider=SnapshotProvider.GOOGLE,
        )
        db.add(s)
        # Commit before queueing so the row has an id for the task to load.
        db.commit()
        background_tasks.add_task(generate_snapshot, s.id)
    if s:
        return {"url": s.link}
    # Returning the bare status constant would send the integer 403 as the
    # body of a 200 response; raise so the client sees a real 403.
    raise HTTPException(status.HTTP_403_FORBIDDEN)

View file

@ -11,8 +11,13 @@ class SnapshotContext(BaseModel):
article_published: str
article_title: str
article_url: str
article_mirror_url: str | None = None
matomo_host: str
matomo_site_id: int
page_direction: str | None = None
page_language: str | None = None
site_favicon: str | None = None
site_logo: str = None
site_title: str
site_mirror_url: str | None = None
site_url: str

8
src/snapshots/service.py Normal file
View file

@ -0,0 +1,8 @@
from sqlalchemy.orm import Session
from src.snapshots.models import Snapshot
def resolve_snapshot(db: Session, url: str) -> str | None:
    """Return the link of the pool-0 snapshot stored for ``url``, if any.

    Args:
        db: Database session to query.
        url: Article URL to look up.

    Returns:
        The snapshot's ``link`` value, or ``None`` when no matching row exists.
    """
    query = db.query(Snapshot).filter(Snapshot.url == url, Snapshot.pool == 0)
    found = query.first()
    if not found:
        return None
    return found.link

View file

@ -1,5 +1,29 @@
import logging
from datetime import datetime
from src.database import get_db_session
from src.snapshots.client import SnapshotCamera
from src.snapshots.models import Snapshot, SnapshotState
from src.google.client import upload_blob
from src.utils import hashids
def upload_snapshot(filename: str, content: str) -> None:
    """Upload ``content`` as a UTF-8 encoded ``text/html`` blob named ``filename``."""
    payload = content.encode("utf-8")
    upload_blob(filename, payload, "text/html")
def generate_snapshot(id_: int) -> None:
    """Render and upload the pending snapshot whose primary key is ``id_``.

    Does nothing when no row with that id is in the PENDING state. On success
    the row is moved to UPDATING and its publication timestamp is set; on any
    failure the error is logged with its traceback and the row is marked
    FAILED.

    Args:
        id_: Primary key of the ``Snapshot`` row to process.
    """
    with get_db_session() as db:
        snapshot = (
            db.query(Snapshot)
            .filter(Snapshot.id == id_, Snapshot.snapshot_state == SnapshotState.PENDING)
            .first()
        )
        if not snapshot:
            return
        try:
            content = SnapshotCamera(snapshot.url).render()
            upload_blob(hashids.encode(snapshot.id) + ".html", content.encode("utf-8"), "text/html")
            snapshot.snapshot_state = SnapshotState.UPDATING
            # NOTE(review): naive local time - confirm whether the column
            # is expected to hold UTC.
            snapshot.snapshot_published_at = datetime.now()
            db.commit()
        except Exception:
            # Task boundary: record the failure instead of propagating it.
            # logging.exception keeps the traceback that logging.error(e) lost.
            logging.exception("Failed to generate snapshot %s", id_)
            # If the commit above was what failed, the session must be rolled
            # back before it can be used again; this also discards the
            # half-applied UPDATING state.
            db.rollback()
            snapshot.snapshot_state = SnapshotState.FAILED
            db.commit()

View file

@ -1,9 +1,8 @@
{% from "article.css.j2" import article_css %}<!DOCTYPE html>
<html {% if page_direction %} dir="{{ page_direction }}"{% endif %}{% if page_language %} lang="{{ page_language }}"{% endif %} prefix="og: https://ogp.me/ns#">
<base href="{{ article_url }}" />
<head>
<meta charset="utf-8">
<title>{{ article_title }}</title>
<title>{{ article_title.strip() }}</title>
<meta name="viewport"
content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, user-scalable=no"/>
<meta name="format-detection" content="telephone=no"/>
@ -12,7 +11,7 @@
<meta name="HandheldFriendly" content="True"/>
<meta property="og:type" content="article"/>
<meta property="og:title" content="{{ article_title }}"/>
<meta property="og:title" content="{{ article_title.strip() }}"/>
<meta property="og:site_name" content="{{ site_title }}"/>
<meta property="og:url" content="{{ article_url }}"/>
{% if article_image_source %}
@ -20,9 +19,7 @@
{% endif %}
<meta name="twitter:card" content="summary_large_image"/>
{% if article_author %}<meta property="article:author" content="{{ article_author }}"/>{% endif %}
{% if noindex %}<meta name="robots" content="noindex" />{% endif %}
<meta name="robots" content="noindex" />
{% if site_favicon %}
<link rel="icon" href="{{ site_favicon }}" />
{% endif %}
@ -47,19 +44,36 @@
let target = e.target.closest("a");
if (target) {
// if the click was on or within an <a>
if (!target.href.includes("cloudfront.net") &&
!target.href.includes("azureedge.net") &&
!target.href.includes("global.ssl.fastly.net")) {
if (!target.className.includes("snap-skip-link") &&
!target.className.includes("snap-link--mirror") &&
!target.className.includes("snap-link--snapshot")) {
e.preventDefault();
document.body.dataset.currentLink = target.href;
}
}
});
var _paq = window._paq = window._paq || [];
var p = "{{ article_url }}";
_paq.push(["setCustomUrl", p]);
_paq.push(["setExcludedQueryParams", ["roomName", "account", "accountnum", "address", "address1", "address2", "address3", "addressline1", "addressline2", "adres", "adresse", "age", "alter", "auth", "authpw", "bic", "billingaddress", "billingaddress1", "billingaddress2", "calle", "cardnumber", "cc", "ccc", "cccsc", "cccvc", "cccvv", "ccexpiry", "ccexpmonth", "ccexpyear", "ccname", "ccnumber", "cctype", "cell", "cellphone", "city", "clientid", "clientsecret", "company", "consumerkey", "consumersecret", "contrasenya", "contrase\u00f1a", "creditcard", "creditcardnumber", "cvc", "cvv", "dateofbirth", "debitcard", "direcci\u00f3n", "dob", "domain", "ebost", "email", "emailaddress", "emailadresse", "epos", "epost", "eposta", "exp", "familyname", "firma", "firstname", "formlogin", "fullname", "gender", "geschlecht", "gst", "gstnumber", "handynummer", "has\u0142o", "heslo", "iban", "ibanaccountnum", "ibanaccountnumber", "id", "identifier", "indirizzo", "kartakredytowa", "kennwort", "keyconsumerkey", "keyconsumersecret", "konto", "kontonr", "kontonummer", "kredietkaart", "kreditkarte", "kreditkort", "lastname", "login", "mail", "mobiili", "mobile", "mobilne", "nachname", "name", "nickname", "false", "osoite", "parole", "pass", "passord", "password", "passwort", "pasword", "paswort", "paword", "phone", "pin", "plz", "postalcode", "postcode", "postleitzahl", "privatekey", "publickey", "pw", "pwd", "pword", "pwrd", "rue", "secret", "secretq", "secretquestion", "shippingaddress", "shippingaddress1", "shippingaddress2", "socialsec", "socialsecuritynumber", "socsec", "sokak", "ssn", "steuernummer", "strasse", "street", "surname", "swift", "tax", "taxnumber", "tel", "telefon", "telefonnr", "telefonnummer", "telefono", "telephone", "token", "token_auth", "tokenauth", "t\u00e9l\u00e9phone", "ulica", "user", "username", "vat", "vatnumber", "via", "vorname", "wachtwoord", "wagwoord", "webhooksecret", "website", "zip", "zipcode"]]);
_paq.push(["trackPageView", p]);
_paq.push(['enableLinkTracking']);
(function () {
var u = "//{{ matomo_host }}/";
_paq.push(['setTrackerUrl', u + 'matomo.php']);
_paq.push(['setSiteId', '{{ matomo_site_id }}']);
var d = document, g = d.createElement('script'), s = d.getElementsByTagName('script')[0];
g.async = true;
g.src = u + 'matomo.js';
s.parentNode.insertBefore(g, s);
})();
</script>
</head>
<body>
<div class="snap-wrapper">
<a href="#snap-main" class="snap-skip-link">Skip to main content</a>
<a href="#snap-main" class="snap-skip-link">{{ gettext("Skip to main content") }}</a>
<details class="snap-trust-header">
<summary class="snap-trust-header__header">
@ -89,14 +103,14 @@
<header class="snap-page-header">
<nav class="snap-page-header-nav">
{% if article_mirror_url %}<a href="{{ article_mirror_url }}">{% endif %}
{% if site_mirror_url %}<a href="{{ site_mirror_url }}" class="snap-link--mirror">{% endif %}
<img src="{{ site_logo }}" alt="{{ site_title }}" class="snap-page-header-logo">
{% if article_mirror_url %}</a>{% endif %}
{% if site_mirror_url %}</a>{% endif %}
</nav>
</header>
<main id="snap-main">
<header class="snap-article-header">
<h1>{{ article_title }}</h1>
<h1>{{ article_title.strip() }}</h1>
<div class="snap-byline">
{{ article_published }} - {{ site_title }}
</div>
@ -113,8 +127,8 @@
{{ article_body }}
{% if article_mirror_url %}
<p>
<a href="{{ article_mirror_url }}" class="snap-footer-link">
View the original article
<a href="{{ article_mirror_url }}" class="snap-footer-link snap-link--mirror">
{{ gettext("View the original article") }}
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M19.7212 13.0822C19.3072 13.0822 18.9712 13.4189 18.9712 13.8322V18.9712H5.02881V5.02881H10.167C10.5818 5.02881 10.917 4.69279 10.917 4.27881C10.917 3.86483 10.5818 3.52881 10.167 3.52881H4.27881C3.86405 3.52881 3.52881 3.86483 3.52881 4.27881V19.7212C3.52881 20.136 3.86405 20.4712 4.27881 20.4712H19.7212C20.136 20.4712 20.4712 20.136 20.4712 19.7212V13.8322C20.4712 13.4197 20.136 13.0822 19.7212 13.0822Z"
fill="#222F3A"></path>
@ -128,7 +142,7 @@
</main>
<footer class="snap-footer">
<div>
{% if site_mirror_url %}<a href="https://d7qg4uz16a7xs.cloudfront.net/">{% endif %}
{% if site_mirror_url %}<a href="{{ site_mirror_url }}">{% endif %}
<img src="{{ site_logo }}" alt="{{ site_title }} logo">
{% if site_mirror_url %}</a>{% endif %}
</div>

View file

@ -144,8 +144,8 @@ figcaption {
box-sizing: border-box;
color: #333;
display: inline-block;
max-width: 335px;
width: 100%;
width: auto;
max-width: 100%;
border: 1px solid #e0dfdd;
border-radius: 4px;
padding: 16px 24px;
@ -168,11 +168,19 @@ figcaption {
}
.snap-footer-link svg {
margin-top: 3px;
}
.snap-footer-link svg:dir(ltr) {
float: right;
margin-left: 10px;
}
.snap-footer-link:dir(rtl) svg {
float: left;
-webkit-transform: scaleX(-1);
transform: scaleX(-1);
margin-right: 10px;
}
.snap-footer-link--disabled {