pygea/pygea/utilities.py
2026-03-29 13:48:30 +02:00

210 lines
6.3 KiB
Python

# pylint: disable-msg=C0201
"""
- * -
Utilities for the Pangea CMS Service API
- * -
"""
import hashlib
import os
from configparser import ConfigParser, NoOptionError, NoSectionError
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
def acquire(url):
    """Simple wrapper over the request object.

    Fetch *url* with a 20-second timeout and return the response body as
    text. Returns None when the server answers with a non-200 status or
    the request fails at the transport level (DNS, timeout, refused
    connection) — previously those exceptions propagated to the caller.
    """
    try:
        response = requests.get(url, timeout=20)
    except requests.exceptions.RequestException as err:
        # Fail soft, consistent with the bad-status path below.
        print("Failed to retrieve the web page: " + str(err))
        return None
    # Check if the request was successful
    if response.status_code != 200:
        print(
            "Failed to retrieve the web page. Status code: " + str(response.status_code)
        )
        return None
    return response.text
def parse_url_elements(url):
    """URL hackery - returns domain and Pangea article ID from a provided URL"""
    parsed = urlparse(url)
    # article ID is the file name at the end of the path ('324534.html'),
    # stripped of its extension
    filename = parsed.path.split("/")[-1]
    return {
        "domain": parsed.hostname,
        "article_id": filename.split(".")[0],
    }
def get_webpage_metadata(page_url):
    """Get HTML metadata elements from a webpage.

    Returns a dict mapping <meta> name/property attributes to their
    content, plus derived "language", "favicon" and "canonical" entries
    when present. Returns None when the page could not be fetched.
    """
    parsed = urlparse(page_url)
    domain = parsed.netloc
    #
    # USAGM websites support the OpenGraph tags which provide most
    # of the metadata we require.
    #
    html_content = acquire(page_url)
    if html_content is None:  # identity check; was "== None"
        return None
    soup = BeautifulSoup(html_content, "html.parser")
    metadata = {}
    for tag in soup.find_all("meta"):
        if "name" in tag.attrs:
            metadata[tag.attrs["name"]] = tag.attrs.get("content", "")
        elif "property" in tag.attrs:  # For OpenGraph metadata
            metadata[tag.attrs["property"]] = tag.attrs.get("content", "")
    # add useful language property; guard against documents lacking an
    # <html> element or a lang attribute (previously IndexError/KeyError)
    html_tag = soup.find("html")
    metadata["language"] = html_tag.get("lang") if html_tag else None
    # add links
    for tag in soup.find_all("link"):
        rel = tag.attrs.get("rel")
        if rel is None:
            continue
        # an "alternate icon" link carries the site favicon
        if "alternate" in rel and "icon" in rel:
            metadata["favicon"] = "https://" + domain + tag.attrs.get("href")
        if rel[0] == "canonical":
            metadata["canonical"] = tag.attrs.get("href")
    return metadata
def get_media_metadata(image_url):
    """Get metadata for media content from website (via response headers).

    Issues a HEAD request and returns a dict with "content_type" and
    "content_length" on a 200 response, otherwise None. Either value may
    be None when the server omits the corresponding header.
    """
    response = requests.head(image_url, timeout=20)
    meta = None
    if response.status_code == 200:
        # Chunked responses may omit Content-Length; .get avoids the
        # KeyError that direct indexing raised here.
        meta = {
            "content_type": response.headers.get("Content-Type"),
            "content_length": response.headers.get("Content-Length"),
        }
    return meta
def make_boolean(bool_str):
    """Convert a boolean string to an actual Boolean.

    "false" (case-insensitive) yields False; every other string —
    including "true" and unrecognized values — yields True, preserving
    the original "following Python conventions" default. The original
    chained comparisons with the bitwise & operator; the whole ladder
    reduces to a single comparison.
    """
    return bool_str.lower() != "false"
def get_api_key():
    """Return the API key. PYGEA_API_KEY env var takes precedence over pygea.ini.
    Returns None if neither source provides a value."""
    # Environment wins over the config file.
    key_from_env = os.environ.get("PYGEA_API_KEY")
    if key_from_env:
        return key_from_env
    parser = ConfigParser()
    parser.read("pygea.ini")
    try:
        return parser.get("runtime", "api_key")
    except (NoSectionError, NoOptionError):
        # No file, no section, or no key — signal absence to the caller.
        return None
def get_configuration_variable(section, vname):
    """Retrieve a value from the pygea.ini configuration file.

    Literal "True"/"False" strings are converted to real booleans; all
    other values are returned as strings. Raises NoSectionError or
    NoOptionError when the entry is missing (unchanged behavior). The
    original combined the two string comparisons with the bitwise |
    operator; membership testing is the idiomatic form.
    """
    config = ConfigParser()
    config.read("pygea.ini")
    value = config.get(section, vname)
    if value in ("True", "False"):
        value = value == "True"
    return value
def is_domain_name(domain):
    """Does the provided string resemble a domain name?"""
    # A dotted string is treated as a domain; the original spelled this
    # as any(char in domain for char in "."), which is the same test.
    return "." in domain
def hash_site_metadata(metadata):
    """Create a secure hash of website HTTP meta headers to use as an RSS/ATOM ID."""
    hasher = hashlib.sha256()
    # Feed each key and value pair into the digest in insertion order —
    # the byte stream is identical to concatenating key + value per entry.
    for name, value in metadata.items():
        hasher.update(name.encode("utf8"))
        hasher.update(value.encode("utf8"))
    return hasher.hexdigest()
def rss_namespace_supported(prop):
    """Determine if a provided RSS/XML namespace is valid in the FeedGen RSS package."""
    # Set membership replaces the original list scan + explicit branches.
    return prop in {
        "dc",
        "geo",
        "gen_entry",
        "media",
        "podcast",
        "podcast_entry",
        "syndication",
        "torrent",
    }
def rss_namespace_for_property(prop):
    """Returns the XML namespace for a specified <channel> or <item>
    property from among a list of the most popular namespace schemes
    according to:
    https://www.rssboard.org/news/168/rss-channel-element-usage-stats
    For an exhaustive list of namespace schemes see:
    https://validator.w3.org/feed/docs/howto/declare_namespaces.html
    """
    known_namespaces = {
        "content": "http://purl.org/rss/1.0/modules/content/",  # content
        "dc": "http://purl.org/dc/elements/1.1/",  # Dublin Core
        "atom": "http://www.w3.org/2005/Atom",  # ATOM
        "sy": "http://purl.org/rss/1.0/modules/syndication/",  # Syndication
        "admin": "http://webns.net/mvcb/",
        "feedburner": "http://rssnamespace.org/feedburner/ext/1.0",  # Feedburner
        "cc": "http://web.resource.org/cc/",  # copyrights
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        "opensearch": "http://a9.com/-/spec/opensearch/1.1/",  # OpenSearch
        "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",  # Apple iTunes
        "blogChannel": "http://backend.userland.com/blogChannelModule",  # BlogChannel
        "media": "http://search.yahoo.com/mrss/",  # media RSS
        "icbm": "http://postneo.com/icbm",  # ICBM
        "cf": "http://www.microsoft.com/schemas/rss/core/2005",  # a Microsoft thing
        "podcast": "https://podcastindex.org/namespace/1.0",  # Podcast RSS
        "xhtml": "http://www.w3.org/1999/xhtml",  # XHTML
    }
    # Only the namespace prefix (the part before the first ":") matters;
    # dict.get yields None for unknown prefixes, as before.
    prefix = prop.partition(":")[0]
    return known_namespaces.get(prefix)