210 lines
6.3 KiB
Python
210 lines
6.3 KiB
Python
# pylint: disable-msg=C0201
|
|
"""
|
|
- * -
|
|
Utilities for the Pangea CMS Service API
|
|
|
|
- * -
|
|
"""
|
|
import hashlib
|
|
import os
|
|
from configparser import ConfigParser, NoOptionError, NoSectionError
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def acquire(url):
    """Fetch a URL and return its body text, or None on a non-200 response."""
    response = requests.get(url, timeout=20)

    # Guard clause: anything other than HTTP 200 is treated as a failure.
    if response.status_code != 200:
        print(
            "Failed to retrieve the web page. Status code: " + str(response.status_code)
        )
        return None

    return response.text
|
|
|
|
|
|
def parse_url_elements(url):
    """URL hackery - returns domain and Pangea article ID from a provided URL.

    The article ID is the file name at the end of the URL path with its
    extension stripped (e.g. '.../324534.html' -> '324534').
    """
    parts = urlparse(url)

    # article ID is the file name at the end of the path ('324534.html').
    # Negative indexing replaces the original `lst[len(lst) - 1]` idiom;
    # `filename` avoids shadowing the (py2) builtin name `file`.
    filename = parts.path.split("/")[-1]

    return {
        "domain": parts.hostname,
        "article_id": filename.split(".")[0],
    }
|
|
|
|
|
|
def get_webpage_metadata(page_url):
    """Get HTML metadata elements from a webpage.

    Returns a dict mapping <meta> name/property attributes to their content
    values, plus:
      - "language":  the <html lang="..."> attribute
      - "favicon":   absolute favicon URL, when an icon link appears inside a
                     rel containing "alternate"
      - "canonical": the canonical URL, when declared
    Returns None if the page could not be fetched.
    """
    parsed = urlparse(page_url)
    domain = parsed.netloc

    #
    # USAGM websites support the OpenGraph tags which provide most
    # of the metadata we require.
    #
    html_content = acquire(page_url)
    if html_content is None:  # fixed: was `== None`; identity test is correct
        return None

    soup = BeautifulSoup(html_content, "html.parser")
    meta_tags = soup.find_all("meta")

    metadata = {}
    for tag in meta_tags:
        if "name" in tag.attrs:
            metadata[tag.attrs["name"]] = tag.attrs.get("content", "")
        elif "property" in tag.attrs:  # For OpenGraph metadata
            metadata[tag.attrs["property"]] = tag.attrs.get("content", "")

    # add useful language property
    # NOTE(review): assumes the page has an <html> element carrying a `lang`
    # attribute; raises IndexError/KeyError otherwise — confirm acceptable.
    html = soup.find_all("html")
    metadata["language"] = html[0]["lang"]

    # add links
    link_tags = soup.find_all("link")
    for tag in link_tags:
        if "rel" in tag.attrs:
            if "alternate" in tag.attrs["rel"]:
                if "icon" in tag.attrs["rel"]:
                    metadata["favicon"] = "https://" + domain + tag.attrs.get("href")
            if tag.attrs["rel"][0] == "canonical":
                metadata["canonical"] = tag.attrs.get("href")

    return metadata
|
|
|
|
|
|
def get_media_metadata(image_url):
    """Get metadata for media content from website (via response headers).

    Returns {"content_type": ..., "content_length": ...} on HTTP 200, else
    None. Either value may itself be None when the server omits the header
    (e.g. chunked responses carry no Content-Length).
    """
    response = requests.head(image_url, timeout=20)
    meta = None
    if response.status_code == 200:
        # .get() instead of [] so a missing header yields None rather than
        # raising KeyError.
        meta = {
            "content_type": response.headers.get("Content-Type"),
            "content_length": response.headers.get("Content-Length"),
        }

    return meta
|
|
|
|
|
|
def make_boolean(bool_str):
    """Convert a boolean string to an actual Boolean.

    Only the string "false" (any letter case) maps to False; "true" and any
    unrecognized value map to True (following Python conventions).
    """
    # The original chained bitwise `&` over comparisons; the whole decision
    # table collapses to one comparison with identical results.
    return bool_str.lower() != "false"
|
|
|
|
|
|
def get_api_key():
    """Return the API key. PYGEA_API_KEY env var takes precedence over pygea.ini.

    Returns None if neither source provides a value."""
    # The environment variable wins when set and non-empty.
    key_from_env = os.environ.get("PYGEA_API_KEY")
    if key_from_env:
        return key_from_env

    # Fall back to the [runtime] section of the local config file.
    parser = ConfigParser()
    parser.read("pygea.ini")
    try:
        return parser.get("runtime", "api_key")
    except (NoSectionError, NoOptionError):
        return None
|
|
|
|
|
|
def get_configuration_variable(section, vname):
    """Retrieve values from the configuration file.

    Values spelled exactly "True"/"False" are converted to real booleans via
    make_boolean(); everything else is returned as the raw string. Propagates
    configparser.NoSectionError / NoOptionError for missing entries.
    """
    config = ConfigParser()
    config.read("pygea.ini")

    value = config.get(section, vname)
    # Fixed idiom: bitwise `|` over comparisons worked only by accident on
    # booleans; a membership test says the same thing directly.
    if value in ("True", "False"):
        value = make_boolean(value)

    return value
|
|
|
|
|
|
def is_domain_name(domain):
    """Does the provided string resemble a domain name?

    Heuristic: any string containing at least one dot qualifies.
    """
    # `any(char in domain for char in ".")` was a roundabout substring test
    # over a one-character string; a plain `in` check is equivalent.
    return "." in domain
|
|
|
|
|
|
def hash_site_metadata(metadata):
    """Create a secure hash of website HTTP meta headers to use as an RSS/ATOM ID.

    NOTE: the digest depends on dict insertion order — the same key/value
    pairs inserted in a different order hash differently.
    """
    sh = hashlib.sha256()
    # Iterate items() rather than keys() + re-lookup (the C0201 smell the
    # file-level pylint pragma was suppressing).
    for key, value in metadata.items():
        sh.update(key.encode("utf8") + value.encode("utf8"))

    return sh.hexdigest()
|
|
|
|
|
|
def rss_namespace_supported(prop):
    """Determine if a provided RSS/XML namespace is valid in the FeedGen RSS package."""
    supported_namespaces = frozenset((
        "dc",
        "geo",
        "gen_entry",
        "media",
        "podcast",
        "podcast_entry",
        "syndication",
        "torrent",
    ))
    # `if x in seq: return True / return False` collapses to the test itself;
    # a frozenset makes the membership check O(1).
    return prop in supported_namespaces
|
|
|
|
|
|
def rss_namespace_for_property(prop):
    """Returns the XML namespace for a specified <channel> or <item>
    property from among a list of the most popular namespace schemes
    according to:
    https://www.rssboard.org/news/168/rss-channel-element-usage-stats
    For an exhaustive list of namespace schemes see:
    https://validator.w3.org/feed/docs/howto/declare_namespaces.html

    Returns None when the property's prefix is not a known namespace.
    """
    known_namespaces = {
        "content": "http://purl.org/rss/1.0/modules/content/",  # content
        "dc": "http://purl.org/dc/elements/1.1/",  # Dublin Core
        "atom": "http://www.w3.org/2005/Atom",  # ATOM
        "sy": "http://purl.org/rss/1.0/modules/syndication/",  # Syndication
        "admin": "http://webns.net/mvcb/",
        "feedburner": "http://rssnamespace.org/feedburner/ext/1.0",  # Feedburner
        "cc": "http://web.resource.org/cc/",  # copyrights
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        "opensearch": "http://a9.com/-/spec/opensearch/1.1/",  # OpenSearch
        "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",  # Apple iTunes
        "blogChannel": "http://backend.userland.com/blogChannelModule",  # BlogChannel
        "media": "http://search.yahoo.com/mrss/",  # media RSS
        "icbm": "http://postneo.com/icbm",  # ICBM
        "cf": "http://www.microsoft.com/schemas/rss/core/2005",  # a Microsoft thing
        "podcast": "https://podcastindex.org/namespace/1.0",  # Podcast RSS
        "xhtml": "http://www.w3.org/1999/xhtml",  # XHTML
    }

    # The namespace prefix is the part before the first ':' ("dc" in
    # "dc:creator"). A single .get() replaces the original truthiness check
    # plus second lookup — all mapped values are non-empty strings, so the
    # None default reproduces the "unknown prefix" behavior exactly.
    prefix = prop.split(":", 1)[0]
    return known_namespaces.get(prefix)
|