This commit is contained in:
Abel Luck 2026-03-29 13:48:30 +02:00
parent 81bb8afc41
commit 98dcea4d7e
10 changed files with 811 additions and 478 deletions

46
flake.lock generated Normal file
View file

@ -0,0 +1,46 @@
{
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1774386573,
"narHash": "sha256-4hAV26quOxdC6iyG7kYaZcM3VOskcPUrdCQd/nx8obc=",
"rev": "46db2e09e1d3f113a13c0d7b81e2f221c63b8ce9",
"revCount": 969196,
"type": "tarball",
"url": "https://api.flakehub.com/f/pinned/NixOS/nixpkgs/0.1.969196%2Brev-46db2e09e1d3f113a13c0d7b81e2f221c63b8ce9/019d279e-af65-79ce-92be-5dee7b1e36d4/source.tar.gz"
},
"original": {
"type": "tarball",
"url": "https://flakehub.com/f/NixOS/nixpkgs/0.1"
}
},
"root": {
"inputs": {
"nixpkgs": "nixpkgs",
"treefmt-nix": "treefmt-nix"
}
},
"treefmt-nix": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1773297127,
"narHash": "sha256-6E/yhXP7Oy/NbXtf1ktzmU8SdVqJQ09HC/48ebEGBpk=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "71b125cd05fbfd78cab3e070b73544abe24c5016",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "treefmt-nix",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

138
flake.nix Normal file
View file

@ -0,0 +1,138 @@
{
description = "pygea - Pangea RSS feed generator";
inputs = {
nixpkgs.url = "https://flakehub.com/f/NixOS/nixpkgs/0.1";
treefmt-nix = {
url = "github:numtide/treefmt-nix";
inputs.nixpkgs.follows = "nixpkgs";
};
};
outputs =
{
self,
nixpkgs,
treefmt-nix,
...
}:
let
systems = [ "x86_64-linux" ];
forAllSystems =
fn:
nixpkgs.lib.genAttrs systems (
system:
fn (
import nixpkgs {
inherit system;
config.allowUnfree = true;
}
)
);
mkTreefmtConfig = pkgs: (treefmt-nix.lib.evalModule pkgs ./treefmt.nix).config;
in
{
formatter = forAllSystems (pkgs: (mkTreefmtConfig pkgs).build.wrapper);
packages = forAllSystems (
pkgs:
let
pkg = pkgs.callPackage ./nix/packages/pygea/package.nix { };
in
{
pygea = pkg;
default = pkg;
}
);
apps = forAllSystems (
pkgs:
let
package = self.packages.${pkgs.stdenv.hostPlatform.system}.default;
in
{
pygea = {
type = "app";
program = "${package}/bin/pygea";
meta.description = "pygea runtime";
};
default = {
type = "app";
program = "${package}/bin/pygea";
meta.description = "pygea runtime";
};
}
);
checks = forAllSystems (
pkgs:
let
inherit (pkgs.stdenv.hostPlatform) system;
exportedPackage = self.packages.${system}.default;
treefmtConfig = mkTreefmtConfig pkgs;
smokePython = pkgs.python313.withPackages (ps: [
ps.requests
ps.beautifulsoup4
ps.feedgen
ps."python-dateutil"
]);
smokeCheck = pkgs.runCommand "pygea-smoke" { nativeBuildInputs = [ smokePython ]; } ''
export PYTHONPATH="${exportedPackage}/${pkgs.python313.sitePackages}:$PYTHONPATH"
python - <<'PY'
from pathlib import Path
for source_file in Path("${./.}/pygea").glob("*.py"):
compile(source_file.read_text(encoding="utf-8"), str(source_file), "exec")
PY
python -c "import pygea; import pygea.utilities; import pygea.pexception"
mkdir -p "$out"
touch "$out/passed"
'';
deadnixCheck = pkgs.runCommand "pygea-deadnix" { nativeBuildInputs = [ pkgs.deadnix ]; } ''
cd ${./.}
deadnix --fail .
mkdir -p "$out"
touch "$out/passed"
'';
statixCheck = pkgs.runCommand "pygea-statix" { nativeBuildInputs = [ pkgs.statix ]; } ''
cd ${./.}
statix check
mkdir -p "$out"
touch "$out/passed"
'';
in
{
devshell-default = self.devShells.${system}.default;
formatter = treefmtConfig.build.wrapper;
package-default = exportedPackage;
treefmt = treefmtConfig.build.check ./.;
smoke = smokeCheck;
deadnix = deadnixCheck;
statix = statixCheck;
}
);
devShells = forAllSystems (
pkgs:
let
treefmtConfig = mkTreefmtConfig pkgs;
in
{
default = pkgs.mkShell {
packages = [
pkgs.python313
pkgs.uv
self.packages.${pkgs.stdenv.hostPlatform.system}.default
treefmtConfig.build.wrapper
pkgs.deadnix
pkgs.statix
];
};
}
);
};
}

View file

@ -0,0 +1,36 @@
{
lib,
python3Packages,
}:
python3Packages.buildPythonApplication {
pname = "pygea";
version = "0.1.0";
pyproject = true;
src = lib.cleanSource ../../..;
build-system = [
python3Packages.hatchling
];
dependencies = [
python3Packages.requests
python3Packages.beautifulsoup4
python3Packages.feedgen
python3Packages.python-dateutil
];
pythonImportsCheck = [
"pygea"
"pygea.utilities"
"pygea.pexception"
];
meta = {
description = "Pangea RSS feed generator";
homepage = "https://gitlab.com/guardianproject-ops/pygea";
license = lib.licenses.lgpl3Plus;
mainProgram = "pygea";
};
}

View file

@ -1,16 +1,16 @@
"""Pygea main entry point""" """Pygea main entry point"""
import hashlib import hashlib
import json import json
import os import os
from pygea import utilities
from pygea.pangeafeed import PangeaFeed from pygea.pangeafeed import PangeaFeed
from pygea.pexception import PangeaServiceException from pygea.pexception import PangeaServiceException
from pygea import utilities
OUTPUT_TO_FILE = utilities.get_configuration_variable("results", "output_to_file_p")
OUTPUT_TO_FILE = utilities.get_configuration_variable('results', 'output_to_file_p') OUTPUT_FILE_NAME = utilities.get_configuration_variable("results", "output_file_name")
OUTPUT_FILE_NAME = utilities.get_configuration_variable('results', 'output_file_name') OUTPUT_DIRECTORY = utilities.get_configuration_variable("results", "output_directory")
OUTPUT_DIRECTORY = utilities.get_configuration_variable('results', 'output_directory')
def write_manifest(categories): def write_manifest(categories):
@ -22,61 +22,71 @@ def write_manifest(categories):
if not os.path.exists(output_directory): if not os.path.exists(output_directory):
os.makedirs(output_directory) os.makedirs(output_directory)
manifest_path = os.path.join(output_directory, 'manifest.json') manifest_path = os.path.join(output_directory, "manifest.json")
with open(manifest_path, 'w', encoding='utf-8') as mfile: with open(manifest_path, "w", encoding="utf-8") as mfile:
json.dump({'categories': categories}, mfile, indent=2, ensure_ascii=False) json.dump({"categories": categories}, mfile, indent=2, ensure_ascii=False)
mfile.write('\n') mfile.write("\n")
def main(): def main():
# Feeds are generated for a single, specified, domain # Feeds are generated for a single, specified, domain
domain = 'www.martinoticias.com' domain = "www.martinoticias.com"
args = { args = {
# tuple values: # tuple values:
# [0] category name or a string representing a content query # [0] category name or a string representing a content query
# [1] only the newest content desired (as configured in pygea.ini)? # [1] only the newest content desired (as configured in pygea.ini)?
# [2] special content_type for this category only (from the approved list of types) # [2] special content_type for this category only (from the approved list of types)
'categories': [ "categories": [
('Titulares',True, None), ("Titulares", True, None),
('Cuba', True, None), ("Cuba", True, None),
('América Latina', True, None), ("América Latina", True, None),
('Info Martí ', False, None), # YES! this category name has a space character at the end! (
('Noticiero Martí Noticias', True, None) "Info Martí ",
False,
None,
), # YES! this category name has a space character at the end!
("Noticiero Martí Noticias", True, None),
], ],
'default_content_type': "articles" "default_content_type": "articles",
} }
# TWO OPTIONS from the args defined above: # TWO OPTIONS from the args defined above:
# 1. Generate a single feed from the defined categories # 1. Generate a single feed from the defined categories
#try: # try:
# pf = PangeaFeed(domain, args) # pf = PangeaFeed(domain, args)
# pf.acquire_content() # pf.acquire_content()
# pf.generate_feed() # pf.generate_feed()
# pf.disgorge() # pf.disgorge()
#except PangeaServiceException as error: # except PangeaServiceException as error:
# print(error) # print(error)
# 2. Generate different feeds for each defined category # 2. Generate different feeds for each defined category
try: try:
manifest_categories = [] manifest_categories = []
for cat_tuple in args['categories']: for cat_tuple in args["categories"]:
# form new args for each category/query # form new args for each category/query
newargs = { newargs = {"categories": [cat_tuple], "default_content_type": "articles"}
'categories': [cat_tuple],
'default_content_type': "articles"
}
pf = PangeaFeed(domain, newargs) pf = PangeaFeed(domain, newargs)
pf.acquire_content() pf.acquire_content()
pf.generate_feed() pf.generate_feed()
# put each feed into a different sub-directory # put each feed into a different sub-directory
feed_subdir = hashlib.md5(cat_tuple[0].encode('utf-8')).hexdigest()[:7] feed_subdir = hashlib.md5(cat_tuple[0].encode("utf-8")).hexdigest()[:7]
pf.disgorge(feed_subdir) pf.disgorge(feed_subdir)
manifest_categories.append({ manifest_categories.append(
'name': cat_tuple[0], {
'short-hash': feed_subdir, "name": cat_tuple[0],
'local-path': os.path.join(feed_subdir, OUTPUT_FILE_NAME).replace(os.sep, '/') "short-hash": feed_subdir,
}) "local-path": os.path.join(feed_subdir, OUTPUT_FILE_NAME).replace(
print("feed for {} output to sub-directory {}".format(cat_tuple[0], feed_subdir)) os.sep, "/"
),
}
)
print(
"feed for {} output to sub-directory {}".format(
cat_tuple[0], feed_subdir
)
)
write_manifest(manifest_categories) write_manifest(manifest_categories)
except PangeaServiceException as error: except PangeaServiceException as error:
print(error) print(error)

View file

@ -1,30 +1,31 @@
# pylint: disable-msg=C0103 # pylint: disable-msg=C0103
# pylint: disable-msg=C0201 # pylint: disable-msg=C0201
""" """
- * - - * -
Generate a custom RSS feed from Pangea, for a specific domain, with one or more Generate a custom RSS feed from Pangea, for a specific domain, with one or more
categories or content filters and an optional supplied content-type. categories or content filters and an optional supplied content-type.
- * - - * -
""" """
import os import os
import sys import sys
from datetime import datetime from datetime import datetime
from feedgen.feed import FeedGenerator from feedgen.feed import FeedGenerator
from pygea import pangeaservice
from pygea import pexception
from pygea import utilities
VERBOSE = utilities.get_configuration_variable('runtime', 'verbose_p') from pygea import pangeaservice, pexception, utilities
OUTPUT_TO_FILE = utilities.get_configuration_variable('results', 'output_to_file_p')
OUTPUT_FILE_NAME = utilities.get_configuration_variable('results', 'output_file_name')
OUTPUT_DIRECTORY = utilities.get_configuration_variable('results', 'output_directory')
class PangeaFeed(): VERBOSE = utilities.get_configuration_variable("runtime", "verbose_p")
OUTPUT_TO_FILE = utilities.get_configuration_variable("results", "output_to_file_p")
OUTPUT_FILE_NAME = utilities.get_configuration_variable("results", "output_file_name")
OUTPUT_DIRECTORY = utilities.get_configuration_variable("results", "output_directory")
class PangeaFeed:
_domain = None _domain = None
_categories = None _categories = None
_content_type = 'articles' # default _content_type = "articles" # default
def __init__(self, domain, kw_args): def __init__(self, domain, kw_args):
try: try:
@ -33,45 +34,56 @@ class PangeaFeed():
raise error raise error
self._domain = domain self._domain = domain
if kw_args.get('categories'): if kw_args.get("categories"):
self._categories = kw_args['categories'] self._categories = kw_args["categories"]
else: else:
raise pexception.PangeaServiceException("ERROR: At least one category or content-query is required") raise pexception.PangeaServiceException(
"ERROR: At least one category or content-query is required"
if kw_args.get('default_content_type'): )
if kw_args['default_content_type'] not in self._ps.content_types():
raise pexception.PangeaServiceException("{} is not a valid content type".format(kw_args['content_type']))
self._content_type = kw_args['default_content_type']
if kw_args.get("default_content_type"):
if kw_args["default_content_type"] not in self._ps.content_types():
raise pexception.PangeaServiceException(
"{} is not a valid content type".format(kw_args["content_type"])
)
self._content_type = kw_args["default_content_type"]
def acquire_content(self): def acquire_content(self):
self._full_article_list = [] self._full_article_list = []
for (cat, old, type) in self._categories: for cat, old, type in self._categories:
opt_args = {} opt_args = {}
# special type for this category? # special type for this category?
if type is None: if type is None:
type = self._content_type type = self._content_type
# wants old stuff (not configured date limit)? # wants old stuff (not configured date limit)?
if old is not None: if old is not None:
opt_args['daycount'] = 365 # oldest date = one year opt_args["daycount"] = 365 # oldest date = one year
opt_args['filter_date'] = False opt_args["filter_date"] = False
ci = self._ps.category_info(cat) ci = self._ps.category_info(cat)
if ci is not None: if ci is not None:
# cat is pre-defined category # cat is pre-defined category
opt_args['zoneid'] = ci['id'] opt_args["zoneid"] = ci["id"]
jbody = self._ps.get_content(type, opt_args) jbody = self._ps.get_content(type, opt_args)
else: else:
# cat as actually a free-form query string to be used no article content # cat as actually a free-form query string to be used no article content
jbody = self._ps.query_content(cat, opt_args) jbody = self._ps.query_content(cat, opt_args)
if len(jbody) == 0: if len(jbody) == 0:
if VERBOSE: if VERBOSE:
print("no articles available for {} [command: {}] [category/query: '{}'])".format(self._domain, self._content_type, cat)) print(
"no articles available for {} [command: {}] [category/query: '{}'])".format(
self._domain, self._content_type, cat
)
)
continue continue
if VERBOSE: if VERBOSE:
print ("{} articles added from category/query '{}'".format(str(len(jbody)), cat)) print(
"{} articles added from category/query '{}'".format(
str(len(jbody)), cat
)
)
for art in jbody: for art in jbody:
self._full_article_list.append(art) self._full_article_list.append(art)
@ -81,7 +93,7 @@ class PangeaFeed():
# Get preparatory information from the domain's homepage. Most characteristics # Get preparatory information from the domain's homepage. Most characteristics
# of the RSS Channel information are acquired from the homepage metadata. # of the RSS Channel information are acquired from the homepage metadata.
# #
md = utilities.get_webpage_metadata('https://' + self._domain) md = utilities.get_webpage_metadata("https://" + self._domain)
fg = FeedGenerator() fg = FeedGenerator()
self._fg = fg self._fg = fg
@ -90,31 +102,38 @@ class PangeaFeed():
# build the RSS <channel> element # build the RSS <channel> element
# #
fg.id(utilities.hash_site_metadata(md)) fg.id(utilities.hash_site_metadata(md))
fg.title(self._content_type + ' from ' + md['og:site_name']) fg.title(self._content_type + " from " + md["og:site_name"])
fg.link(href=md['og:url'], rel='alternate') fg.link(href=md["og:url"], rel="alternate")
fg.description(self._content_type + ' from ' + self._domain + " (" + md['description'] + ")") fg.description(
self._content_type
+ " from "
+ self._domain
+ " ("
+ md["description"]
+ ")"
)
# #
# NOTE: the parameters required for <image> in the <channel> are different # NOTE: the parameters required for <image> in the <channel> are different
# from <image> in an <item> # from <image> in an <item>
fg.image(url=md['og:image'], title=md['og:site_name'], link=md['og:url']) fg.image(url=md["og:image"], title=md["og:site_name"], link=md["og:url"])
# #
# Multiple categories/keywords are allowed in the RSS Channel # Multiple categories/keywords are allowed in the RSS Channel
keywords = md['keywords'] keywords = md["keywords"]
categories = keywords.split(',') categories = keywords.split(",")
sch = 'https://' + self._domain + '/' sch = "https://" + self._domain + "/"
for name in categories: for name in categories:
fg.category(term=name, scheme=sch, label=name) fg.category(term=name, scheme=sch, label=name)
fg.language(md['language']) fg.language(md["language"])
fg.generator('Guardian Project Pangea CMS Crawler 1.0') fg.generator("Guardian Project Pangea CMS Crawler 1.0")
fg.webMaster('support@guardianproject.info') fg.webMaster("support@guardianproject.info")
fg.ttl(60) fg.ttl(60)
datetime_obj = datetime.now() datetime_obj = datetime.now()
formatted_time = datetime_obj.strftime('%a, %d %b %Y %H:%M:%S %Z') formatted_time = datetime_obj.strftime("%a, %d %b %Y %H:%M:%S %Z")
fg.lastBuildDate(formatted_time + '+0000') fg.lastBuildDate(formatted_time + "+0000")
# #
# Build the <item> elements for each <item> and add each item to the RSS Channel # Build the <item> elements for each <item> and add each item to the RSS Channel
@ -122,71 +141,78 @@ class PangeaFeed():
media_extension_loaded = False media_extension_loaded = False
for article in self._full_article_list: for article in self._full_article_list:
try: try:
article_deets = self._ps.get_article_detail(article['id']) article_deets = self._ps.get_article_detail(article["id"])
rss_article = self._ps.rss_article_from_pangea_article(article_deets) rss_article = self._ps.rss_article_from_pangea_article(article_deets)
except pexception.PangeaServiceException as error: except pexception.PangeaServiceException as error:
if VERBOSE: if VERBOSE:
print(error) print(error)
print("article with id [{}] may no longer exist in Pangea".format(str(article['id']))) print(
"article with id [{}] may no longer exist in Pangea".format(
str(article["id"])
)
)
continue continue
fe = fg.add_entry() fe = fg.add_entry()
fe.title(rss_article['title']) fe.title(rss_article["title"])
fe.link({'href': rss_article['link']}) fe.link({"href": rss_article["link"]})
fe.guid(rss_article['guid']) fe.guid(rss_article["guid"])
fe.pubDate(rss_article['pubDate']) fe.pubDate(rss_article["pubDate"])
fe.content(rss_article['content']) fe.content(rss_article["content"])
if rss_article.get('summary'): if rss_article.get("summary"):
fe.description(rss_article['summary']) fe.description(rss_article["summary"])
if rss_article.get('enclosure'): if rss_article.get("enclosure"):
enc_md = rss_article['enclosure'] enc_md = rss_article["enclosure"]
if enc_md.get('type'): if enc_md.get("type"):
fe.enclosure( fe.enclosure(
url=enc_md['url'], url=enc_md["url"], type=enc_md["type"], length=enc_md["length"]
type=enc_md['type'], )
length=enc_md['length'])
else: else:
fe.enclosure(url=enc_md['url']) fe.enclosure(url=enc_md["url"])
if rss_article.get('media_content'): if rss_article.get("media_content"):
# #
# special handling for the RSS media extension # special handling for the RSS media extension
# #
if not media_extension_loaded: if not media_extension_loaded:
fg.load_extension('media') fg.load_extension("media")
media_extension_loaded = True media_extension_loaded = True
if VERBOSE: print("media extension loaded") if VERBOSE:
print("media extension loaded")
mc_md = rss_article['media_content'] mc_md = rss_article["media_content"]
if mc_md.get('medium'): if mc_md.get("medium"):
fe.media.content( fe.media.content(
url=mc_md['url'], url=mc_md["url"],
type=mc_md['type'], type=mc_md["type"],
fileSize=mc_md['fileSize'], fileSize=mc_md["fileSize"],
medium=mc_md['medium']) medium=mc_md["medium"],
)
else: else:
fe.media.content(url=mc_md['url']) fe.media.content(url=mc_md["url"])
def disgorge(self, subdirectory=None):
def disgorge(self, subdirectory = None):
# #
# Output the RSS feed as appropriate # Output the RSS feed as appropriate
# #
if OUTPUT_TO_FILE is True: if OUTPUT_TO_FILE is True:
try: try:
if subdirectory is not None: if subdirectory is not None:
if not os.path.exists(OUTPUT_DIRECTORY + '/' + subdirectory): if not os.path.exists(OUTPUT_DIRECTORY + "/" + subdirectory):
os.makedirs(OUTPUT_DIRECTORY + '/' + subdirectory) os.makedirs(OUTPUT_DIRECTORY + "/" + subdirectory)
ofile = OUTPUT_DIRECTORY + '/' + subdirectory + '/' + OUTPUT_FILE_NAME ofile = (
OUTPUT_DIRECTORY + "/" + subdirectory + "/" + OUTPUT_FILE_NAME
)
else: else:
if not os.path.exists(OUTPUT_DIRECTORY): if not os.path.exists(OUTPUT_DIRECTORY):
os.makedirs(OUTPUT_DIRECTORY) os.makedirs(OUTPUT_DIRECTORY)
ofile = OUTPUT_DIRECTORY + '/' + OUTPUT_FILE_NAME ofile = OUTPUT_DIRECTORY + "/" + OUTPUT_FILE_NAME
self._fg.rss_file(ofile, extensions=True, pretty=True) self._fg.rss_file(ofile, extensions=True, pretty=True)
except OSError as fe: except OSError as fe:
print("for {} file error: ".format(ofile, str(fe))) print("for {} file error: ".format(ofile, str(fe)))
sys.exit(1) sys.exit(1)
if VERBOSE: print("output written to {}".format(ofile)) if VERBOSE:
print("output written to {}".format(ofile))
else: else:
print(self._fg.rss_str(extensions=True, pretty=True)) print(self._fg.rss_str(extensions=True, pretty=True))

View file

@ -1,40 +1,41 @@
""" """
- * - - * -
Interface to USAGM Pangea Content Management System API Interface to USAGM Pangea Content Management System API
This implementation is a subset of API functions, focusing on the eventual This implementation is a subset of API functions, focusing on the eventual
creation of RSS (or other) data streams from article selections creation of RSS (or other) data streams from article selections
Pangea Documentation: Pangea Documentation:
https://showcase.pangea-cms.com/a/pangea-api-methods-and-models/29663096.html https://showcase.pangea-cms.com/a/pangea-api-methods-and-models/29663096.html
:copyright: 2024, David Oliver <david@guardianproject.info> :copyright: 2024, David Oliver <david@guardianproject.info>
:license: http://www.gnu.org/copyleft/lesser.html GNU Lesser General Public License :license: http://www.gnu.org/copyleft/lesser.html GNU Lesser General Public License
- * - - * -
""" """
import hashlib
import json import json
import re import re
import hashlib
import urllib.parse import urllib.parse
from datetime import datetime, timezone, timedelta from datetime import datetime, timedelta, timezone
import requests import requests
from dateutil.parser import * from dateutil.parser import *
from pygea import utilities
from pygea import pexception from pygea import pexception, plogger, utilities
from pygea import plogger
class PangeaService: class PangeaService:
""" Interface to the Pangea API """ """Interface to the Pangea API"""
_configuration_file_name = 'pygea.ini' _configuration_file_name = "pygea.ini"
_api_path = '/api2/' _api_path = "/api2/"
_api_key = None _api_key = None
# Pangea and RSS time format # Pangea and RSS time format
TIME_FMT = "%Y-%m-%dT%H:%M:%S.%f" # ex. 2024-08-02T11:46:28.673 TIME_FMT = "%Y-%m-%dT%H:%M:%S.%f" # ex. 2024-08-02T11:46:28.673
TIME_FMT_I = "%Y-%m-%dT%H:%M:%S" # ex. 2024-08-02T11:46:28 TIME_FMT_I = "%Y-%m-%dT%H:%M:%S" # ex. 2024-08-02T11:46:28
RFC822_FMT = "%a, %d %B %Y %H:%M:%S %z" RFC822_FMT = "%a, %d %B %Y %H:%M:%S %z"
# API commands - commands commented out are valid in the API but NOT SUPPORTED HERE # API commands - commands commented out are valid in the API but NOT SUPPORTED HERE
@ -42,72 +43,71 @@ class PangeaService:
"articledetail", "articledetail",
"articles", "articles",
"audioclips", "audioclips",
#"audioscheduler", # "audioscheduler",
"author", "author",
#"blogitem", # "blogitem",
"breakingnews", "breakingnews",
#"comment", # "comment",
"config", "config",
#"documentdetail", # "documentdetail",
"empty", "empty",
#"factcheckdetail", # "factcheckdetail",
#"htmlwidget", # "htmlwidget",
#"infographicdetail", # "infographicdetail",
#"liveblogs", # "liveblogs",
#"livestream", # "livestream",
"mostpopular", "mostpopular",
#"polldetail", # "polldetail",
#"quizdetail", # "quizdetail",
"search", "search",
"test", "test",
"topstories", "topstories",
"videoclips", "videoclips",
#"videoscheduler", # "videoscheduler",
#"widget", # "widget",
"zone" "zone",
] ]
# Position-indexed content category names # Position-indexed content category names
_category_types_list = [ _category_types_list = [
'none', # 0 internally "none", # 0 internally
'content', # 1 internally "content", # 1 internally
'audio', # 2 internally "audio", # 2 internally
'content+audio', # 3 internally; compound type 1+2 "content+audio", # 3 internally; compound type 1+2
'media', # 4 internally "media", # 4 internally
'content+media', # 5 internally; compound type 1+4 "content+media", # 5 internally; compound type 1+4
'audio+media' # 6 internally; compound type 2+4 "audio+media", # 6 internally; compound type 2+4
] ]
# Content types (in the editorial sense) # Content types (in the editorial sense)
# Note these also map to commands in _commands_list # Note these also map to commands in _commands_list
_content_types_list = [ _content_types_list = [
'articles', "articles",
'audioclips', "audioclips",
'videoclips', "videoclips",
'breakingnews', "breakingnews",
'mostpopular', "mostpopular",
'topstories' "topstories",
] ]
# How to format content # How to format content
# (we WILL NOT use these in combination, as defined in the API) # (we WILL NOT use these in combination, as defined in the API)
_content_options = { _content_options = {
'WTF_0': 0, # Returns basically what is in database "WTF_0": 0, # Returns basically what is in database
'TEXT_ONLY': 1, # Removes all html keeping text only "TEXT_ONLY": 1, # Removes all html keeping text only
'WTF_1': 2, # Returns tags as they would be displayed on the page "WTF_1": 2, # Returns tags as they would be displayed on the page
'MOBILE_1': 4, # Returns html as for mobile/rss feeds without "MOBILE_1": 4, # Returns html as for mobile/rss feeds without
# additional stripping # additional stripping
'MOBILE_2': 8, # Returns html as for mobile/rss feeds with stripping "MOBILE_2": 8, # Returns html as for mobile/rss feeds with stripping
# some html that is not supported # some html that is not supported
'MOBILE_3': 16, # Returns html as for mobile/rss feeds with some extra "MOBILE_3": 16, # Returns html as for mobile/rss feeds with some extra
# html tags stripped # html tags stripped
'WTF_2': 32, # Same as for Feeds + replaces recognized links with "WTF_2": 32, # Same as for Feeds + replaces recognized links with
# internal links and wraps recognized images inside tags # internal links and wraps recognized images inside tags
'XML_TX': 64, # Used with Feeds to apply xsl transformation "XML_TX": 64, # Used with Feeds to apply xsl transformation
'JSON': 128 # Generates json structured content "JSON": 128, # Generates json structured content
} }
def __init__(self, domain, key=None, verbose=False): def __init__(self, domain, key=None, verbose=False):
self._logger = plogger.PangeaServiceLogger() self._logger = plogger.PangeaServiceLogger()
@ -123,13 +123,21 @@ class PangeaService:
# #
# preset from configuration file # preset from configuration file
# #
self._max_articles = int(utilities.get_configuration_variable('runtime', 'max_articles')) self._max_articles = int(
self._oldest_article = int(utilities.get_configuration_variable('runtime', 'oldest_article')) utilities.get_configuration_variable("runtime", "max_articles")
self._content_format = utilities.get_configuration_variable('runtime', 'content_format') )
self._authors_p = utilities.get_configuration_variable('runtime', 'authors_p') self._oldest_article = int(
self._no_media_p = utilities.get_configuration_variable('runtime', 'no_media_p') utilities.get_configuration_variable("runtime", "oldest_article")
self._content_inc_p = utilities.get_configuration_variable('runtime', 'content_inc_p') )
self._verbose_p = utilities.get_configuration_variable('runtime', 'verbose_p') self._content_format = utilities.get_configuration_variable(
"runtime", "content_format"
)
self._authors_p = utilities.get_configuration_variable("runtime", "authors_p")
self._no_media_p = utilities.get_configuration_variable("runtime", "no_media_p")
self._content_inc_p = utilities.get_configuration_variable(
"runtime", "content_inc_p"
)
self._verbose_p = utilities.get_configuration_variable("runtime", "verbose_p")
self._domain = domain self._domain = domain
@ -140,13 +148,13 @@ class PangeaService:
if verbose: if verbose:
self._verbose_p = verbose self._verbose_p = verbose
if self._verbose_p: if self._verbose_p:
print('verbose output') print("verbose output")
# #
# These two dictionaries index the category information # These two dictionaries index the category information
# _all_categories is indexed by category name; _rev_categories is indexed by id # _all_categories is indexed by category name; _rev_categories is indexed by id
# #
self._all_categories = { } self._all_categories = {}
self._rev_categories = {'0': 'none'} self._rev_categories = {"0": "none"}
# Acquire the categories registered for the supplied domain # Acquire the categories registered for the supplied domain
# Invokes an API call! # Invokes an API call!
@ -157,19 +165,18 @@ class PangeaService:
# #
def set_domain(self, value): def set_domain(self, value):
""" Sets the USAGM Internet domain name from which content is acquired """ """Sets the USAGM Internet domain name from which content is acquired"""
self._domain = value self._domain = value
# Reset the category dictionaries # Reset the category dictionaries
self._all_categories = { } self._all_categories = {}
self._rev_categories = {'0': 'none'} self._rev_categories = {"0": "none"}
# Acquire the categories registered for the supplied domain (API call) # Acquire the categories registered for the supplied domain (API call)
self.get_categories() self.get_categories()
def set_api_key(self, key): def set_api_key(self, key):
""" Sets the API key that allows access to the API """ """Sets the API key that allows access to the API"""
self._api_key = key self._api_key = key
# #
@ -177,50 +184,44 @@ class PangeaService:
# #
def content_types(self): def content_types(self):
""" Return full list of content types. """ """Return full list of content types."""
return self._content_types_list return self._content_types_list
def content_type_name(self, type_index): def content_type_name(self, type_index):
""" Returns name of a content type given its index. """ """Returns name of a content type given its index."""
if type_index > len(self._content_types_list): if type_index > len(self._content_types_list):
return False return False
return self._content_types_list[type_index] return self._content_types_list[type_index]
def commands(self): def commands(self):
""" Return the list of possible commands. """ """Return the list of possible commands."""
return self._commands_list return self._commands_list
def category_types(self): def category_types(self):
"""Return of list of possible category types. """ """Return of list of possible category types."""
return self._category_types_list return self._category_types_list
def category_info(self, category_name): def category_info(self, category_name):
""" Return rich information about a category. """ """Return rich information about a category."""
if self._all_categories.get(category_name): if self._all_categories.get(category_name):
return self._all_categories[category_name] return self._all_categories[category_name]
return None return None
def content_options(self): def content_options(self):
""" Return the dictionary of content format options. """ """Return the dictionary of content format options."""
return self._content_options return self._content_options
def is_valid_command(self, cmd): def is_valid_command(self, cmd):
""" Test if the provided command is valid and implemented. """ """Test if the provided command is valid and implemented."""
return self._is_implemented(cmd) return self._is_implemented(cmd)
def is_valid_category(self, category_name): def is_valid_category(self, category_name):
""" Test if a provided category name is valid. """Test if a provided category name is valid.
NOTE: Categories are unique on a per-domain basis, so they are retrieve NOTE: Categories are unique on a per-domain basis, so they are retrieve
via the API when this class is instantiated. There are no "generic" via the API when this class is instantiated. There are no "generic"
categories that apply to all domains. categories that apply to all domains.
.""" ."""
keys = self._all_categories.keys() keys = self._all_categories.keys()
if category_name in keys: if category_name in keys:
return True return True
@ -232,115 +233,122 @@ class PangeaService:
def rss_article_from_pangea_article(self, article): def rss_article_from_pangea_article(self, article):
""" """
Use this method to convert an API-returned articledetail definition Use this method to convert an API-returned articledetail definition
to an RSS-appropriate definition. to an RSS-appropriate definition.
This method succeeds using the bare article definition, but will be This method succeeds using the bare article definition, but will be
absent the content field and other descriptors. Enclosures are absent the content field and other descriptors. Enclosures are
returned, however. returned, however.
""" """
rss = { } rss = {}
sh = hashlib.sha256() sh = hashlib.sha256()
sh.update(article['url'].encode('utf8')) sh.update(article["url"].encode("utf8"))
rss['guid'] = sh.hexdigest() rss["guid"] = sh.hexdigest()
rss['title'] = article['title'] rss["title"] = article["title"]
rss['link'] = article['url'] rss["link"] = article["url"]
if article.get('introduction'): if article.get("introduction"):
rss['summary'] = article['introduction'] rss["summary"] = article["introduction"]
if article.get('authors'): if article.get("authors"):
as_str = '' as_str = ""
for auth in article['authors']: for auth in article["authors"]:
as_str += auth['lastname'] + ", " + auth['firstname'] + ";" as_str += auth["lastname"] + ", " + auth["firstname"] + ";"
if len(article['authors']) > 1: if len(article["authors"]) > 1:
as_str = as_str[0: (len(as_str) - 2)] as_str = as_str[0 : (len(as_str) - 2)]
rss['authors'] = as_str rss["authors"] = as_str
if article.get('image'): if article.get("image"):
# Seek the enclosure details from the image's server # Seek the enclosure details from the image's server
metadata = utilities.get_media_metadata(article['image']) metadata = utilities.get_media_metadata(article["image"])
if metadata: if metadata:
rss['enclosure'] = { rss["enclosure"] = {
'url': article['image'], "url": article["image"],
'type': metadata['content_type'], "type": metadata["content_type"],
'length': metadata['content_length'] "length": metadata["content_length"],
} }
else: else:
rss['enclosure'] = {'url': article['image']} rss["enclosure"] = {"url": article["image"]}
if rss.get('enclosure'): if rss.get("enclosure"):
if self._verbose_p: if self._verbose_p:
print( print(
"article contains an enclosure:\n" "article contains an enclosure:\n"
+ json.dumps(rss['enclosure'], indent=4)) + json.dumps(rss["enclosure"], indent=4)
)
# 'audioclips' and 'videoclips' occasionally have no text content # 'audioclips' and 'videoclips' occasionally have no text content
if article.get('content'): if article.get("content"):
rss['content'] = article['content'] rss["content"] = article["content"]
else: else:
rss['content'] = '' rss["content"] = ""
# all articles are required to have one category (their 'zone') # all articles are required to have one category (their 'zone')
if bool(article.get('zone')): if bool(article.get("zone")):
zone_id = article['zone'] zone_id = article["zone"]
if isinstance(zone_id, int): if isinstance(zone_id, int):
zone_id = str(zone_id) zone_id = str(zone_id)
if self._rev_categories.get('zone_id'): if self._rev_categories.get("zone_id"):
rss['categories'] = self._rev_categories[zone_id] rss["categories"] = self._rev_categories[zone_id]
elif article.get('zoneTitle'): elif article.get("zoneTitle"):
rss['categories'] = article['zoneTitle'] rss["categories"] = article["zoneTitle"]
# Pangea time is always in GMT # Pangea time is always in GMT
# Pangea time is formatted as: 2024-07-31T11:46:28.673 # Pangea time is formatted as: 2024-07-31T11:46:28.673
# (though occasionally: 2024-07-31T11:46:28) # (though occasionally: 2024-07-31T11:46:28)
# Convert to RSS time (RFC822) # Convert to RSS time (RFC822)
if not article.get('pubDate'): if not article.get("pubDate"):
datetime_obj = datetime.now(timezone.utc) datetime_obj = datetime.now(timezone.utc)
else: else:
if re.match('.*?([.][0-9]+)$', article['pubDate']): if re.match(".*?([.][0-9]+)$", article["pubDate"]):
datetime_obj = datetime.strptime(article['pubDate'], self.TIME_FMT) datetime_obj = datetime.strptime(article["pubDate"], self.TIME_FMT)
else: else:
datetime_obj = datetime.strptime(article['pubDate'], self.TIME_FMT_I) datetime_obj = datetime.strptime(article["pubDate"], self.TIME_FMT_I)
formatted_time = datetime_obj.strftime(self.RFC822_FMT) formatted_time = datetime_obj.strftime(self.RFC822_FMT)
rss['pubDate'] = formatted_time + '+0000' rss["pubDate"] = formatted_time + "+0000"
# Media types # Media types
if bool(article.get('videos')): if bool(article.get("videos")):
if len(article['videos']) > 0: if len(article["videos"]) > 0:
url = article['videos'][0]['url'] url = article["videos"][0]["url"]
metadata = utilities.get_media_metadata(url) metadata = utilities.get_media_metadata(url)
if metadata: if metadata:
rss['media_content'] = { rss["media_content"] = {
'url': url, "url": url,
'type': metadata['content_type'], "type": metadata["content_type"],
'fileSize': metadata['content_length'], "fileSize": metadata["content_length"],
'medium': 'video' "medium": "video",
} }
if self._verbose_p: print("article contains video media:\n" if self._verbose_p:
+ json.dumps(rss['media_content'], indent=4)) print(
"article contains video media:\n"
+ json.dumps(rss["media_content"], indent=4)
)
else: else:
rss['media_content'] = {'url': url} rss["media_content"] = {"url": url}
if bool(article.get('audios')): if bool(article.get("audios")):
if len(article['audios']) > 0: if len(article["audios"]) > 0:
url = article['audios'][0]['url'] url = article["audios"][0]["url"]
metadata = utilities.get_media_metadata(url) metadata = utilities.get_media_metadata(url)
if metadata: if metadata:
rss['media_content'] = { rss["media_content"] = {
'url': url, "url": url,
'type': metadata['content_type'], "type": metadata["content_type"],
'fileSize': metadata['content_length'], "fileSize": metadata["content_length"],
'medium': 'audio' "medium": "audio",
} }
if self._verbose_p: print("article contains audio media:\n" if self._verbose_p:
+ json.dumps(rss['media_content'], indent=4)) print(
"article contains audio media:\n"
+ json.dumps(rss["media_content"], indent=4)
)
else: else:
rss['media_content'] = {'url': url} rss["media_content"] = {"url": url}
return rss return rss
@ -349,29 +357,26 @@ class PangeaService:
# #
def test_pangea_interface(self): def test_pangea_interface(self):
""" TESTING Basic connectivity test """ """TESTING Basic connectivity test"""
return self._retrieve_content('test') return self._retrieve_content("test")
def empty(self): def empty(self):
""" """
TESTING Returns nothing but, if command formatted properly, with proper API TESTING Returns nothing but, if command formatted properly, with proper API
key, HTTP status will be 200 key, HTTP status will be 200
""" """
res = self._retrieve_content('empty') res = self._retrieve_content("empty")
return res return res
def config(self): def config(self):
""" TESTING Returns configuration information about the API """ """TESTING Returns configuration information about the API"""
return self._retrieve_content('config') return self._retrieve_content("config")
def get_content(self, content_type, optional_args_kw=None):
def get_content(self, content_type, optional_args_kw = None):
""" """
Use this method to get articles by content type, subset by a specific category Use this method to get articles by content type, subset by a specific category
as supplied. See API docs for additional API parameters that can be specified as supplied. See API docs for additional API parameters that can be specified
to reduce the volume of articles returned. to reduce the volume of articles returned.
""" """
try: try:
res = self._retrieve_content(content_type, optional_args_kw) res = self._retrieve_content(content_type, optional_args_kw)
@ -381,99 +386,99 @@ class PangeaService:
# because Pangea does not uniformly apply 'count' and 'daycount' parameters # because Pangea does not uniformly apply 'count' and 'daycount' parameters
# to all content generation, we'll do that here (unless we're told to ignore). # to all content generation, we'll do that here (unless we're told to ignore).
if optional_args_kw is not None: if optional_args_kw is not None:
if optional_args_kw.get('filter_date') is not None: if optional_args_kw.get("filter_date") is not None:
if optional_args_kw.get('filter_date') is False: if optional_args_kw.get("filter_date") is False:
return res return res
return self._threshold(res) return self._threshold(res)
def query_content(self, query, optional_args_kw=None):
def query_content(self, query, optional_args_kw = None):
""" """
Use this method to get articles based on textual search. Use this method to get articles based on textual search.
See API docs for additional API parameters that can be specified See API docs for additional API parameters that can be specified
to reduce the volume of articles returned. Alternatively, see to reduce the volume of articles returned. Alternatively, see
docs for the 'pageNumber' parameter to handling a search returning docs for the 'pageNumber' parameter to handling a search returning
many articles (only query/search supports this parameter). many articles (only query/search supports this parameter).
""" """
# make the topic/category URL-safe # make the topic/category URL-safe
if optional_args_kw is None: if optional_args_kw is None:
optional_args_kw = {} optional_args_kw = {}
optional_args_kw['q'] = urllib.parse.quote_plus(query) optional_args_kw["q"] = urllib.parse.quote_plus(query)
try: try:
res = self._retrieve_content('search', optional_args_kw) res = self._retrieve_content("search", optional_args_kw)
except pexception.PangeaServiceException as e: except pexception.PangeaServiceException as e:
raise pexception.PangeaServiceException(str(e)) from e raise pexception.PangeaServiceException(str(e)) from e
# #
# because Pangea does not uniformly apply 'count' and 'daycount' parameters # because Pangea does not uniformly apply 'count' and 'daycount' parameters
# to all content generation, we'll do that here (unless we're told to ignore). # to all content generation, we'll do that here (unless we're told to ignore).
if optional_args_kw.get('filter_date') is not None: if optional_args_kw.get("filter_date") is not None:
if optional_args_kw.get('filter_date') is False: if optional_args_kw.get("filter_date") is False:
return res return res
return self._threshold(res) return self._threshold(res)
def get_article(self, article_id, optional_args_kw = None): def get_article(self, article_id, optional_args_kw=None):
""" """
Use this method to get all the detail for a given article (typically Use this method to get all the detail for a given article (typically
required to do anything useful). required to do anything useful).
""" """
if optional_args_kw is None: if optional_args_kw is None:
optional_args_kw = {} optional_args_kw = {}
if 'MediaData' not in optional_args_kw.keys(): if "MediaData" not in optional_args_kw.keys():
optional_args_kw['MediaData'] = 'true' optional_args_kw["MediaData"] = "true"
optional_args_kw['itemid'] = article_id optional_args_kw["itemid"] = article_id
try: try:
res = self._retrieve_content('articles', optional_args_kw) res = self._retrieve_content("articles", optional_args_kw)
except pexception.PangeaServiceException as e: except pexception.PangeaServiceException as e:
raise pexception.PangeaServiceException(str(e)) from e raise pexception.PangeaServiceException(str(e)) from e
#print(json.dumps(res, indent=4)) # print(json.dumps(res, indent=4))
return res return res
def get_article_detail(self, article_id, optional_args_kw = None): def get_article_detail(self, article_id, optional_args_kw=None):
""" """
Use this method to get all the detail for a given article (typically Use this method to get all the detail for a given article (typically
required to do anything useful). required to do anything useful).
""" """
if optional_args_kw is None: if optional_args_kw is None:
optional_args_kw = {} optional_args_kw = {}
if 'Content' not in optional_args_kw.keys(): if "Content" not in optional_args_kw.keys():
optional_args_kw['Content'] = 'true' optional_args_kw["Content"] = "true"
if 'MediaData' not in optional_args_kw.keys(): if "MediaData" not in optional_args_kw.keys():
optional_args_kw['MediaData'] = 'true' optional_args_kw["MediaData"] = "true"
optional_args_kw['itemid'] = article_id optional_args_kw["itemid"] = article_id
try: try:
res = self._retrieve_content('articledetail', optional_args_kw) res = self._retrieve_content("articledetail", optional_args_kw)
except pexception.PangeaServiceException as e: except pexception.PangeaServiceException as e:
raise pexception.PangeaServiceException(str(e)) from e raise pexception.PangeaServiceException(str(e)) from e
#print(json.dumps(res, indent=4)) # print(json.dumps(res, indent=4))
return res return res
def get_categories(self, types=None):
def get_categories(self, types = None):
""" """
Categories are defined on a PER DOMAIN basis, so to assure the user Categories are defined on a PER DOMAIN basis, so to assure the user
provides a proper category name we need to acquire the full set of provides a proper category name we need to acquire the full set of
categories before we proceed with any queries. categories before we proceed with any queries.
""" """
if len(self._all_categories.keys()) > 0: if len(self._all_categories.keys()) > 0:
return self._all_categories return self._all_categories
if types is None: if types is None:
types = 'acm' # get all content types 'a', 'c', 'm' at once types = "acm" # get all content types 'a', 'c', 'm' at once
args = {'type': types} args = {"type": types}
try: try:
url = self._build_url('zone', args) url = self._build_url("zone", args)
response = requests.get(url, timeout=20) response = requests.get(url, timeout=20)
if response.status_code != 200: if response.status_code != 200:
msg = "HTP request to {} failed with status code [{}]".format(self._domain, str(response.status_code)) msg = "HTP request to {} failed with status code [{}]".format(
self._domain, str(response.status_code)
)
self._logger.error(msg) self._logger.error(msg)
raise pexception.PangeaServiceException(msg) raise pexception.PangeaServiceException(msg)
a_cat = json.loads(response.text) a_cat = json.loads(response.text)
@ -482,12 +487,14 @@ class PangeaService:
all_keys = self._all_categories.keys() all_keys = self._all_categories.keys()
for c in a_cat: for c in a_cat:
if not c['name'] in all_keys: if not c["name"] in all_keys:
self._all_categories[c['name']] = c self._all_categories[c["name"]] = c
self._rev_categories[str(c['id'])] = c['name'] self._rev_categories[str(c["id"])] = c["name"]
if c['type'] >= len(self._category_types_list): if c["type"] >= len(self._category_types_list):
msg = "ERROR: unknown type: {} on id [{}], name: {}".format(c['type'], str(c['id']), c['name']) msg = "ERROR: unknown type: {} on id [{}], name: {}".format(
c["type"], str(c["id"]), c["name"]
)
self._logger.warning(msg) self._logger.warning(msg)
raise pexception.PangeaServiceException(msg) raise pexception.PangeaServiceException(msg)
@ -498,38 +505,37 @@ class PangeaService:
# #
def _boolean_string(self, boolean_value): def _boolean_string(self, boolean_value):
""" Convert a boolean to a string for the API """ """Convert a boolean to a string for the API"""
if boolean_value is True: if boolean_value is True:
return 'true' return "true"
return 'false' return "false"
def _retrieve_content(self, command, args_kw=None):
def _retrieve_content(self, command, args_kw = None): """Minimalist content retriever"""
""" Minimalist content retriever """
url = self._build_url(command, args_kw) url = self._build_url(command, args_kw)
#print('request URL: ' + url) # print('request URL: ' + url)
response = requests.get(url, timeout=20) response = requests.get(url, timeout=20)
if response.status_code != 200: if response.status_code != 200:
msg = "received status code {} from {}".format(str(response.status_code), url) msg = "received status code {} from {}".format(
str(response.status_code), url
)
self._logger.error(msg) self._logger.error(msg)
raise pexception.PangeaServiceException(msg) raise pexception.PangeaServiceException(msg)
if command == 'empty': if command == "empty":
return json.loads('[]') return json.loads("[]")
return json.loads(response.text) return json.loads(response.text)
def _is_implemented(self, cmd): def _is_implemented(self, cmd):
""" Test if a provided string references an actual command """ """Test if a provided string references an actual command"""
if cmd in self._commands_list: if cmd in self._commands_list:
return True return True
return False return False
def _threshold(self, articles): def _threshold(self, articles):
""" Assure article-count and oldest-article settings are obeyed. Turns out, """Assure article-count and oldest-article settings are obeyed. Turns out,
only a few API commands accept these arguments, though in general our only a few API commands accept these arguments, though in general our
usage of the API requires it to be consistent. usage of the API requires it to be consistent.
""" """
output = [] output = []
article_count = 0 article_count = 0
@ -539,35 +545,44 @@ class PangeaService:
# #
# pubDate may contain milliseconds, or not # pubDate may contain milliseconds, or not
# #
if re.match('.*?([.][0-9]+)$', blob['pubDate']): if re.match(".*?([.][0-9]+)$", blob["pubDate"]):
dt = datetime.strptime(blob['pubDate'], self.TIME_FMT) dt = datetime.strptime(blob["pubDate"], self.TIME_FMT)
dt.replace(microsecond=0) dt.replace(microsecond=0)
else: else:
dt = datetime.strptime(blob['pubDate'], self.TIME_FMT_I) dt = datetime.strptime(blob["pubDate"], self.TIME_FMT_I)
old_dt = datetime.now() - delta old_dt = datetime.now() - delta
if dt < old_dt: if dt < old_dt:
if self._verbose_p: if self._verbose_p:
print("article with ID {} is too old [{}]".format(str(blob['id']), dt.strftime(self.TIME_FMT_I))) print(
"article with ID {} is too old [{}]".format(
str(blob["id"]), dt.strftime(self.TIME_FMT_I)
)
)
else: else:
article_count += 1 article_count += 1
output.append(blob) output.append(blob)
if self._verbose_p & (len(output) < len(articles)): if self._verbose_p & (len(output) < len(articles)):
print("request returned {} articles; newest {} processed".format(str(len(articles)), str(len(output)))) print(
"request returned {} articles; newest {} processed".format(
str(len(articles)), str(len(output))
)
)
reordered = output[::-1] reordered = output[::-1]
return reordered return reordered
def _build_url(self, cmd, args_kw=None):
def _build_url(self, cmd, args_kw = None): """Construct a properly-formatted Pangea API URL"""
""" Construct a properly-formatted Pangea API URL """
if not self._is_implemented(cmd): if not self._is_implemented(cmd):
msg = "ERROR: command [{}] NOT IMPLEMENTED".format(cmd) msg = "ERROR: command [{}] NOT IMPLEMENTED".format(cmd)
self._logger.error(msg) self._logger.error(msg)
raise pexception.PangeaServiceException(msg) raise pexception.PangeaServiceException(msg)
if not self._api_key: if not self._api_key:
msg = "ERROR: no API key supplied (check config file {})".format(self._configuration_file_name) msg = "ERROR: no API key supplied (check config file {})".format(
self._configuration_file_name
)
self._logger.error(msg) self._logger.error(msg)
raise pexception.PangeaServiceException(msg) raise pexception.PangeaServiceException(msg)
@ -576,72 +591,99 @@ class PangeaService:
# this switch verifies (and/or completes) the argument array # this switch verifies (and/or completes) the argument array
match cmd: match cmd:
#simple commands # simple commands
case 'empty' | 'test': case "empty" | "test":
pass pass
# search # search
case 'search': case "search":
if 'q' not in args_kw.keys(): if "q" not in args_kw.keys():
msg = "ERROR: [{}] requires parameter 'q'".format(cmd) msg = "ERROR: [{}] requires parameter 'q'".format(cmd)
self._logger.error(msg) self._logger.error(msg)
raise pexception.PangeaServiceException(msg) raise pexception.PangeaServiceException(msg)
if 'Authors' not in args_kw.keys(): if "Authors" not in args_kw.keys():
args_kw['Authors'] = self._boolean_string(self._authors_p) args_kw["Authors"] = self._boolean_string(self._authors_p)
if 'count' not in args_kw.keys(): if "count" not in args_kw.keys():
args_kw['count'] = self._max_articles args_kw["count"] = self._max_articles
if 'daycount' not in args_kw.keys(): if "daycount" not in args_kw.keys():
args_kw['daycount'] = self._oldest_article args_kw["daycount"] = self._oldest_article
# single-item/detail commands # single-item/detail commands
case 'articledetail' | 'blogitem' | 'comment' | 'author' | 'documentdetail' | 'factcheckdetail' | 'infographicdetail' | 'polldetail' | 'quizdetail': case (
if 'itemid' not in args_kw.keys(): "articledetail"
| "blogitem"
| "comment"
| "author"
| "documentdetail"
| "factcheckdetail"
| "infographicdetail"
| "polldetail"
| "quizdetail"
):
if "itemid" not in args_kw.keys():
msg = "ERROR: [{}] command requires arg 'itemid'".format(cmd) msg = "ERROR: [{}] command requires arg 'itemid'".format(cmd)
self._logger.error(msg) self._logger.error(msg)
raise pexception.PangeaServiceException(msg) raise pexception.PangeaServiceException(msg)
if 'Content' not in args_kw.keys(): if "Content" not in args_kw.keys():
args_kw['Content'] = self._boolean_string(self._content_inc_p) args_kw["Content"] = self._boolean_string(self._content_inc_p)
if 'Authors' not in args_kw.keys(): if "Authors" not in args_kw.keys():
args_kw['Authors'] = self._boolean_string(self._authors_p) args_kw["Authors"] = self._boolean_string(self._authors_p)
if 'html' not in args_kw.keys(): if "html" not in args_kw.keys():
args_kw['html'] = self._content_options[self._content_format] args_kw["html"] = self._content_options[self._content_format]
case 'authorid': case "authorid":
if 'authorid' not in args_kw.keys(): if "authorid" not in args_kw.keys():
msg = "ERROR: [{}] command requires arg 'authorid'".format(cmd) msg = "ERROR: [{}] command requires arg 'authorid'".format(cmd)
self._logger.error(msg) self._logger.error(msg)
raise pexception.PangeaServiceException(msg) raise pexception.PangeaServiceException(msg)
case 'zone': case "zone":
if ('zoneid' not in args_kw.keys()) & ('type' not in args_kw.keys()): if ("zoneid" not in args_kw.keys()) & ("type" not in args_kw.keys()):
msg = "ERROR: [{}] command requires args 'zoneid' or 'type'".format(cmd) msg = "ERROR: [{}] command requires args 'zoneid' or 'type'".format(
cmd
)
self._logger.error(msg) self._logger.error(msg)
raise pexception.PangeaServiceException(msg) raise pexception.PangeaServiceException(msg)
# content commands # content commands
case 'articles' | 'audioclips' | 'videoclips' | 'breakingnews' | 'mostpopular' | 'topstories' | 'blogitem': case (
if 'Authors' not in args_kw.keys(): "articles"
args_kw['Authors'] = self._boolean_string(self._authors_p) | "audioclips"
| "videoclips"
| "breakingnews"
| "mostpopular"
| "topstories"
| "blogitem"
):
if "Authors" not in args_kw.keys():
args_kw["Authors"] = self._boolean_string(self._authors_p)
if 'count' not in args_kw.keys(): if "count" not in args_kw.keys():
args_kw['count'] = self._max_articles args_kw["count"] = self._max_articles
if 'daycount' not in args_kw.keys(): if "daycount" not in args_kw.keys():
args_kw['daycount'] = self._oldest_article args_kw["daycount"] = self._oldest_article
# base for all types of command (apikey needs to be first arg) # base for all types of command (apikey needs to be first arg)
url = "https://" + self._domain + self._api_path + cmd + '?apikey=' + self._api_key url = (
"https://"
+ self._domain
+ self._api_path
+ cmd
+ "?apikey="
+ self._api_key
)
# process the arg array to finish construction of the URL # process the arg array to finish construction of the URL
for key, value in args_kw.items(): for key, value in args_kw.items():
# remove this one # remove this one
if key == 'filter_date': if key == "filter_date":
continue continue
if not isinstance(key, str): if not isinstance(key, str):
@ -649,7 +691,7 @@ class PangeaService:
if not isinstance(value, str): if not isinstance(value, str):
value = str(value) value = str(value)
url += '&' + key + '=' + value url += "&" + key + "=" + value
if self._verbose_p: if self._verbose_p:
print("URL for request: " + url) print("URL for request: " + url)

View file

@ -1,6 +1,7 @@
""" """
A less-generic Exception for the Pangea API Service A less-generic Exception for the Pangea API Service
""" """
class PangeaServiceException(Exception): class PangeaServiceException(Exception):
""" An Exception specific to this API """ """An Exception specific to this API"""

View file

@ -1,56 +1,60 @@
""" """
Logger for the Pangea API Service Logger for the Pangea API Service
""" """
import logging import logging
from pygea import utilities from pygea import utilities
class PangeaServiceLogger: class PangeaServiceLogger:
""" """
Mostly, so that someone can replace this with a production logger later. Mostly, so that someone can replace this with a production logger later.
""" """
_configuration_file_name = 'pygea.ini' _configuration_file_name = "pygea.ini"
_levels = { _levels = {
"NOTSET": 0, "NOTSET": 0,
"DEBUG": 10, "DEBUG": 10,
"INFO": 20, "INFO": 20,
"WARNING": 30, "WARNING": 30,
"ERROR": 40, "ERROR": 40,
"CRITICAL": 50 "CRITICAL": 50,
} }
def __init__(self): def __init__(self):
# #
# preset from configuration file # preset from configuration file
# #
lf = utilities.get_configuration_variable('logging', 'log_file') lf = utilities.get_configuration_variable("logging", "log_file")
dl = utilities.get_configuration_variable('logging', 'default_log_level') dl = utilities.get_configuration_variable("logging", "default_log_level")
if (dl is None) | (dl not in self._levels): if (dl is None) | (dl not in self._levels):
dl = 'DEBUG' dl = "DEBUG"
self._logger = logging.getLogger('PangeaLogger') self._logger = logging.getLogger("PangeaLogger")
self._logger.propagate = False self._logger.propagate = False
logging.basicConfig( logging.basicConfig(
filename=lf, filename=lf,
level=self._levels[dl], level=self._levels[dl],
format='[%(asctime)s] %(levelname)s: %(message)s') format="[%(asctime)s] %(levelname)s: %(message)s",
)
def debug(self, message): def debug(self, message):
""" Debug message """ """Debug message"""
self._logger.debug(message) self._logger.debug(message)
def info(self, message): def info(self, message):
""" Info message """ """Info message"""
self._logger.info(message) self._logger.info(message)
def warning(self, message): def warning(self, message):
""" Warning message """ """Warning message"""
self._logger.warning(message) self._logger.warning(message)
def error(self, message): def error(self, message):
""" Error message """ """Error message"""
self._logger.error(message) self._logger.error(message)
def critical(self, message): def critical(self, message):
""" Critical message """ """Critical message"""
self._logger.critical(message) self._logger.critical(message)

View file

@ -1,47 +1,53 @@
# pylint: disable-msg=C0201 # pylint: disable-msg=C0201
""" """
- * - - * -
Utilities for the Pangea CMS Service API Utilities for the Pangea CMS Service API
- * - - * -
""" """
import hashlib import hashlib
import os import os
from configparser import ConfigParser, NoOptionError, NoSectionError
from urllib.parse import urlparse from urllib.parse import urlparse
from configparser import ConfigParser, NoSectionError, NoOptionError
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
def acquire(url): def acquire(url):
""" Simple wrapper over the request object. """ """Simple wrapper over the request object."""
response = requests.get(url, timeout=20) response = requests.get(url, timeout=20)
# Check if the request was successful # Check if the request was successful
if response.status_code == 200: if response.status_code == 200:
content = response.text content = response.text
else: else:
print("Failed to retrieve the web page. Status code: " + str(response.status_code)) print(
"Failed to retrieve the web page. Status code: " + str(response.status_code)
)
return None return None
return content return content
def parse_url_elements(url): def parse_url_elements(url):
""" URL hackery - returns domain and Pangea article ID from a provided URL """ """URL hackery - returns domain and Pangea article ID from a provided URL"""
out = {} out = {}
parts = urlparse(url) parts = urlparse(url)
out['domain'] = parts.hostname out["domain"] = parts.hostname
# article ID is the file name at the end of the path ('324534.html') # article ID is the file name at the end of the path ('324534.html')
more_parts = parts.path.split('/') more_parts = parts.path.split("/")
file = more_parts[len(more_parts)-1] file = more_parts[len(more_parts) - 1]
file_parts = file.split('.') file_parts = file.split(".")
out['article_id'] = file_parts[0] out["article_id"] = file_parts[0]
return out return out
def get_webpage_metadata(page_url): def get_webpage_metadata(page_url):
""" Get HTML metadata elements from a webpage. """ """Get HTML metadata elements from a webpage."""
parsed = urlparse(page_url) parsed = urlparse(page_url)
domain = parsed.netloc domain = parsed.netloc
# #
@ -50,146 +56,154 @@ def get_webpage_metadata(page_url):
# #
html_content = acquire(page_url) html_content = acquire(page_url)
if html_content == None: if html_content == None:
return None return None
soup = BeautifulSoup(html_content, 'html.parser') soup = BeautifulSoup(html_content, "html.parser")
meta_tags = soup.find_all('meta') meta_tags = soup.find_all("meta")
metadata = {} metadata = {}
for tag in meta_tags: for tag in meta_tags:
if 'name' in tag.attrs: if "name" in tag.attrs:
name = tag.attrs['name'] name = tag.attrs["name"]
content = tag.attrs.get('content', '') content = tag.attrs.get("content", "")
metadata[name] = content metadata[name] = content
elif 'property' in tag.attrs: # For OpenGraph metadata elif "property" in tag.attrs: # For OpenGraph metadata
prop = tag.attrs['property'] prop = tag.attrs["property"]
content = tag.attrs.get('content', '') content = tag.attrs.get("content", "")
metadata[prop] = content metadata[prop] = content
# add useful language property # add useful language property
html = soup.find_all('html') html = soup.find_all("html")
metadata['language'] = html[0]['lang'] metadata["language"] = html[0]["lang"]
# add links # add links
link_tags = soup.find_all('link') link_tags = soup.find_all("link")
for tag in link_tags: for tag in link_tags:
if 'rel' in tag.attrs: if "rel" in tag.attrs:
#print(json.dumps(tag.attrs, indent=4)) # print(json.dumps(tag.attrs, indent=4))
if 'alternate' in tag.attrs['rel']: if "alternate" in tag.attrs["rel"]:
if 'icon' in tag.attrs['rel']: if "icon" in tag.attrs["rel"]:
metadata['favicon'] = 'https://' + domain + tag.attrs.get('href') metadata["favicon"] = "https://" + domain + tag.attrs.get("href")
if tag.attrs['rel'][0] == 'canonical': if tag.attrs["rel"][0] == "canonical":
metadata['canonical'] = tag.attrs.get('href') metadata["canonical"] = tag.attrs.get("href")
return metadata return metadata
def get_media_metadata(image_url): def get_media_metadata(image_url):
""" Get metadata for media content from website (via response headers). """ """Get metadata for media content from website (via response headers)."""
response = requests.head(image_url, timeout=20) response = requests.head(image_url, timeout=20)
meta = None meta = None
if response.status_code == 200: if response.status_code == 200:
meta = { meta = {
"content_type": response.headers['Content-Type'], "content_type": response.headers["Content-Type"],
"content_length": response.headers['Content-Length'] "content_length": response.headers["Content-Length"],
} }
return meta return meta
def make_boolean(bool_str):
    """Convert a boolean string to an actual Boolean.

    :param bool_str: string such as "True", "false", etc. (case-insensitive).
    :returns: ``False`` only when the string is "false" (any casing);
        ``True`` for "true" and for any other value — non-empty strings are
        truthy, following Python conventions.
    """
    # The original branching (unrecognized -> True, "true" -> True,
    # "false" -> False) collapses to a single comparison; this also removes
    # the misuse of bitwise & on boolean expressions.
    return bool_str.lower() != "false"
def get_api_key():
    """Return the API key.

    The ``PYGEA_API_KEY`` environment variable takes precedence over the
    ``[runtime] api_key`` entry in ``pygea.ini``. Returns ``None`` when
    neither source provides a value.
    """
    # The environment always wins over the config file.
    if os.environ.get("PYGEA_API_KEY"):
        return os.environ["PYGEA_API_KEY"]
    parser = ConfigParser()
    parser.read("pygea.ini")
    try:
        return parser.get("runtime", "api_key")
    except (NoSectionError, NoOptionError):
        # Missing section or option: treat as "no key configured".
        return None
def get_configuration_variable(section, vname):
    """Retrieve a value from the pygea.ini configuration file.

    :param section: configuration section name.
    :param vname: option name within the section.
    :returns: the raw string value, converted to a real bool when the string
        is exactly "True" or "False".
    :raises configparser.NoSectionError, configparser.NoOptionError: when the
        section or option is missing (propagated from ``config.get``).
    """
    config = ConfigParser()
    config.read("pygea.ini")
    value = config.get(section, vname)
    # Membership test replaces the misuse of bitwise | on comparisons.
    if value in ("True", "False"):
        value = make_boolean(value)
    return value
def is_domain_name(domain):
    """Does the provided string resemble a domain name?

    A string is treated as a domain name when it contains at least one
    dot separator.
    """
    # The original any() over the one-character string "." was just an
    # obfuscated substring test.
    return "." in domain
def hash_site_metadata(metadata):
    """Create a secure hash of website HTTP meta headers to use as an RSS/ATOM ID.

    :param metadata: mapping of str keys to str values.
    :returns: hex-encoded SHA-256 digest over all key/value pairs.
    """
    hasher = hashlib.sha256()
    # Fold every key/value pair into the digest in dict iteration order.
    for key, value in metadata.items():
        hasher.update(key.encode("utf8") + value.encode("utf8"))
    return hasher.hexdigest()
def rss_namespace_supported(prop):
    """Determine if a provided RSS/XML namespace is valid in the FeedGen RSS package.

    :param prop: namespace prefix to check.
    :returns: True when FeedGen supports the namespace, False otherwise.
    """
    supported_namespaces = (
        "dc",
        "geo",
        "gen_entry",
        "media",
        "podcast",
        "podcast_entry",
        "syndication",
        "torrent",
    )
    # Return the membership test directly instead of if/return-True/return-False.
    return prop in supported_namespaces
def rss_namespace_for_property(prop):
    """Returns the XML namespace URI for a specified <channel> or <item>
    property from among a list of the most popular namespace schemes
    according to:
    https://www.rssboard.org/news/168/rss-channel-element-usage-stats
    For an exhaustive list of namespace schemes see:
    https://validator.w3.org/feed/docs/howto/declare_namespaces.html

    :param prop: property name, optionally prefixed, e.g. "dc:creator" -> "dc".
    :returns: the namespace URI string, or None when the prefix is unknown.
    """
    known_namespaces = {
        "content": "http://purl.org/rss/1.0/modules/content/",  # content
        "dc": "http://purl.org/dc/elements/1.1/",  # Dublin Core
        "atom": "http://www.w3.org/2005/Atom",  # ATOM
        "sy": "http://purl.org/rss/1.0/modules/syndication/",  # Syndication
        "admin": "http://webns.net/mvcb/",
        "feedburner": "http://rssnamespace.org/feedburner/ext/1.0",  # Feedburner
        "cc": "http://web.resource.org/cc/",  # copyrights
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        "opensearch": "http://a9.com/-/spec/opensearch/1.1/",  # OpenSearch
        "itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",  # Apple iTunes
        "blogChannel": "http://backend.userland.com/blogChannelModule",  # BlogChannel
        "media": "http://search.yahoo.com/mrss/",  # media RSS
        "icbm": "http://postneo.com/icbm",  # ICBM
        "cf": "http://www.microsoft.com/schemas/rss/core/2005",  # a Microsoft thing
        "podcast": "https://podcastindex.org/namespace/1.0",  # Podcast RSS
        "xhtml": "http://www.w3.org/1999/xhtml",  # XHTML
    }
    # The namespace prefix is everything before the first ":"; dict.get
    # already yields None for unknown prefixes, so no explicit branch needed.
    prefix = prop.split(":")[0]
    return known_namespaces.get(prefix)

16
treefmt.nix Normal file
View file

@ -0,0 +1,16 @@
# treefmt configuration: declares the formatters that `nix fmt` runs
# across the repository (consumed by treefmt-nix in flake.nix).
_: {
  # Marker file that identifies the project root for treefmt.
  projectRootFile = "flake.nix";
  programs = {
    # Nix source formatter.
    nixfmt.enable = true;
    # Python code formatter.
    black.enable = true;
    # Python import sorter, configured to agree with black's style.
    isort = {
      enable = true;
      profile = "black";
    };
    # Shell script formatter.
    shfmt.enable = true;
  };
}