pygea/pygea/utilities.py
2026-03-29 13:48:30 +02:00

210 lines
6.3 KiB
Python

# pylint: disable-msg=C0201
"""
- * -
Utilities for the Pangea CMS Service API
- * -
"""
import hashlib
import os
from configparser import ConfigParser, NoOptionError, NoSectionError
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
def acquire(url):
    """Simple wrapper over the request object.

    Fetch *url* with a 20-second timeout and return the response body as
    text. Returns None when the server answers with a non-200 status or
    the request fails at the transport level (DNS, timeout, refused
    connection) — previously those exceptions propagated to the caller.
    """
    try:
        response = requests.get(url, timeout=20)
    except requests.exceptions.RequestException as err:
        # Fail soft, consistent with the bad-status path below.
        print("Failed to retrieve the web page: " + str(err))
        return None
    # Check if the request was successful
    if response.status_code != 200:
        print(
            "Failed to retrieve the web page. Status code: " + str(response.status_code)
        )
        return None
    return response.text
def parse_url_elements(url):
    """URL hackery - returns domain and Pangea article ID from a provided URL"""
    parsed = urlparse(url)
    # article ID is the file name at the end of the path ('324534.html'),
    # stripped of its extension
    filename = parsed.path.split("/")[-1]
    return {
        "domain": parsed.hostname,
        "article_id": filename.split(".")[0],
    }
def get_webpage_metadata(page_url):
    """Get HTML metadata elements from a webpage.

    Returns a dict mapping <meta> name/property attributes to their
    content, plus derived "language", "favicon" and "canonical" entries
    when present. Returns None when the page could not be fetched.
    """
    parsed = urlparse(page_url)
    domain = parsed.netloc
    #
    # USAGM websites support the OpenGraph tags which provide most
    # of the metadata we require.
    #
    html_content = acquire(page_url)
    if html_content is None:  # identity check; was "== None"
        return None
    soup = BeautifulSoup(html_content, "html.parser")
    metadata = {}
    for tag in soup.find_all("meta"):
        if "name" in tag.attrs:
            metadata[tag.attrs["name"]] = tag.attrs.get("content", "")
        elif "property" in tag.attrs:  # For OpenGraph metadata
            metadata[tag.attrs["property"]] = tag.attrs.get("content", "")
    # add useful language property; guard against documents lacking an
    # <html> element or a lang attribute (previously IndexError/KeyError)
    html_tag = soup.find("html")
    metadata["language"] = html_tag.get("lang") if html_tag else None
    # add links
    for tag in soup.find_all("link"):
        rel = tag.attrs.get("rel")
        if rel is None:
            continue
        # an "alternate icon" link carries the site favicon
        if "alternate" in rel and "icon" in rel:
            metadata["favicon"] = "https://" + domain + tag.attrs.get("href")
        if rel[0] == "canonical":
            metadata["canonical"] = tag.attrs.get("href")
    return metadata
def get_media_metadata(image_url):
    """Get metadata for media content from website (via response headers).

    Issues a HEAD request and returns a dict with "content_type" and
    "content_length" on a 200 response, otherwise None. Either value may
    be None when the server omits the corresponding header.
    """
    response = requests.head(image_url, timeout=20)
    meta = None
    if response.status_code == 200:
        # Chunked responses may omit Content-Length; .get avoids the
        # KeyError that direct indexing raised here.
        meta = {
            "content_type": response.headers.get("Content-Type"),
            "content_length": response.headers.get("Content-Length"),
        }
    return meta
def make_boolean(bool_str):
    """Convert a boolean string to an actual Boolean.

    "false" (case-insensitive) yields False; every other string —
    including "true" and unrecognized values — yields True, preserving
    the original "following Python conventions" default. The original
    chained comparisons with the bitwise & operator; the whole ladder
    reduces to a single comparison.
    """
    return bool_str.lower() != "false"
def get_api_key():
    """Return the API key. PYGEA_API_KEY env var takes precedence over pygea.ini.
    Returns None if neither source provides a value."""
    # Environment wins over the config file.
    key_from_env = os.environ.get("PYGEA_API_KEY")
    if key_from_env:
        return key_from_env
    parser = ConfigParser()
    parser.read("pygea.ini")
    try:
        return parser.get("runtime", "api_key")
    except (NoSectionError, NoOptionError):
        # No file, no section, or no key — signal absence to the caller.
        return None
def get_configuration_variable(section, vname):
    """Retrieve a value from the pygea.ini configuration file.

    Literal "True"/"False" strings are converted to real booleans; all
    other values are returned as strings. Raises NoSectionError or
    NoOptionError when the entry is missing (unchanged behavior). The
    original combined the two string comparisons with the bitwise |
    operator; membership testing is the idiomatic form.
    """
    config = ConfigParser()
    config.read("pygea.ini")
    value = config.get(section, vname)
    if value in ("True", "False"):
        value = value == "True"
    return value
def is_domain_name(domain):
    """Does the provided string resemble a domain name?"""
    # A dotted string is treated as a domain; the original spelled this
    # as any(char in domain for char in "."), which is the same test.
    return "." in domain
def hash_site_metadata(metadata):
    """Create a secure hash of website HTTP meta headers to use as an RSS/ATOM ID."""
    hasher = hashlib.sha256()
    # Feed each key and value pair into the digest in insertion order —
    # the byte stream is identical to concatenating key + value per entry.
    for name, value in metadata.items():
        hasher.update(name.encode("utf8"))
        hasher.update(value.encode("utf8"))
    return hasher.hexdigest()
def rss_namespace_supported(prop):
    """Determine if a provided RSS/XML namespace is valid in the FeedGen RSS package."""
    # Set membership replaces the original list scan + explicit branches.
    return prop in {
        "dc",
        "geo",
        "gen_entry",
        "media",
        "podcast",
        "podcast_entry",
        "syndication",
        "torrent",
    }
def rss_namespace_for_property(prop):
    """Returns the XML namespace for a specified <channel> or <item>
    property from among a list of the most popular namespace schemes
    according to:
    https://www.rssboard.org/news/168/rss-channel-element-usage-stats
    For an exhaustive list of namespace schemes see:
    https://validator.w3.org/feed/docs/howto/declare_namespaces.html
    """
    known_namespaces = {
        "content": "http://purl.org/rss/1.0/modules/content/",  # content
        "dc": "http://purl.org/dc/elements/1.1/",  # Dublin Core
        "atom": "http://www.w3.org/2005/Atom",  # ATOM
        "sy": "http://purl.org/rss/1.0/modules/syndication/",  # Syndication
        "admin": "http://webns.net/mvcb/",
        "feedburner": "http://rssnamespace.org/feedburner/ext/1.0",  # Feedburner
        "cc": "http://web.resource.org/cc/",  # copyrights
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        "opensearch": "http://a9.com/-/spec/opensearch/1.1/",  # OpenSearch
        "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",  # Apple iTunes
        "blogChannel": "http://backend.userland.com/blogChannelModule",  # BlogChannel
        "media": "http://search.yahoo.com/mrss/",  # media RSS
        "icbm": "http://postneo.com/icbm",  # ICBM
        "cf": "http://www.microsoft.com/schemas/rss/core/2005",  # a Microsoft thing
        "podcast": "https://podcastindex.org/namespace/1.0",  # Podcast RSS
        "xhtml": "http://www.w3.org/1999/xhtml",  # XHTML
    }
    # Only the namespace prefix (the part before the first ":") matters;
    # dict.get yields None for unknown prefixes, as before.
    prefix = prop.partition(":")[0]
    return known_namespaces.get(prefix)