switch to TOML config and export republisher feed manifests

This commit is contained in:
Abel Luck 2026-03-29 14:46:57 +02:00
parent 98dcea4d7e
commit 897af2872c
17 changed files with 832 additions and 324 deletions

View file

@ -1,96 +1,107 @@
"""Pygea main entry point"""
"""Pygea main entry point."""
import hashlib
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
from pygea import utilities
from pygea.pangeafeed import PangeaFeed
from pygea.config import FeedDefinition, PygeaConfig, load_config
from pygea.pexception import PangeaServiceException
OUTPUT_TO_FILE = utilities.get_configuration_variable("results", "output_to_file_p")
OUTPUT_FILE_NAME = utilities.get_configuration_variable("results", "output_file_name")
OUTPUT_DIRECTORY = utilities.get_configuration_variable("results", "output_directory")
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the feed generator.

    Args:
        argv: Argument strings to parse. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        A namespace whose ``config`` attribute holds the path given with
        ``-c``/``--config`` (``"pygea.toml"`` when the option is omitted).
    """
    cli = argparse.ArgumentParser(description="Generate RSS feeds from Pangea")
    config_option = {
        "default": "pygea.toml",
        "help": "Path to runtime config TOML file",
    }
    cli.add_argument("-c", "--config", **config_option)
    namespace = cli.parse_args(argv)
    return namespace
def write_manifest(categories):
"""Write the category manifest beside the generated feed output."""
if OUTPUT_TO_FILE is not True:
def _toml_string(value: str) -> str:
return json.dumps(value, ensure_ascii=False)
def render_manifest(feeds: list[dict[str, str]]) -> str:
    """Render the feed entries as TOML text.

    Each entry becomes one ``[[feeds]]`` array-of-tables item carrying its
    ``name``, ``slug`` and ``url`` values, with a blank line between items.

    Args:
        feeds: Manifest entries; each must provide the keys ``name``,
            ``slug`` and ``url``.

    Returns:
        The TOML document, always terminated by exactly one newline. An
        empty *feeds* list yields a lone newline.
    """
    tables = [
        "\n".join(
            f"{key} = {_toml_string(entry[key])}" for key in ("name", "slug", "url")
        )
        for entry in feeds
    ]
    # A blank line separates consecutive tables; none follows the last one.
    body = "\n\n".join("[[feeds]]\n" + table for table in tables)
    return body.rstrip() + "\n"
def write_manifest(config: PygeaConfig, manifest_feeds: list[dict[str, str]]) -> None:
"""Write the feed manifest beside the generated feed output."""
if config.results.output_to_file_p is not True:
return
output_directory = os.path.normpath(OUTPUT_DIRECTORY)
if not os.path.exists(output_directory):
os.makedirs(output_directory)
manifest_path = os.path.join(output_directory, "manifest.json")
with open(manifest_path, "w", encoding="utf-8") as mfile:
json.dump({"categories": categories}, mfile, indent=2, ensure_ascii=False)
mfile.write("\n")
config.results.output_directory.mkdir(parents=True, exist_ok=True)
manifest_path = config.results.output_directory / "manifest.toml"
manifest_path.write_text(render_manifest(manifest_feeds), encoding="utf-8")
def main():
# Feeds are generated for a single, specified, domain
domain = "www.martinoticias.com"
def feed_class():
from pygea.pangeafeed import PangeaFeed
args = {
# tuple values:
# [0] category name or a string representing a content query
# [1] only the newest content desired (as configured in pygea.ini)?
# [2] special content_type for this category only (from the approved list of types)
"categories": [
("Titulares", True, None),
("Cuba", True, None),
("América Latina", True, None),
(
"Info Martí ",
False,
None,
), # YES! this category name has a space character at the end!
("Noticiero Martí Noticias", True, None),
],
"default_content_type": "articles",
}
return PangeaFeed
# TWO OPTIONS from the args defined above:
# 1. Generate a single feed from the defined categories
# try:
# pf = PangeaFeed(domain, args)
# pf.acquire_content()
# pf.generate_feed()
# pf.disgorge()
# except PangeaServiceException as error:
# print(error)
# 2. Generate different feeds for each defined category
def main(argv: list[str] | None = None) -> int:
args = parse_args(argv)
try:
manifest_categories = []
for cat_tuple in args["categories"]:
# form new args for each category/query
newargs = {"categories": [cat_tuple], "default_content_type": "articles"}
pf = PangeaFeed(domain, newargs)
config = load_config(args.config)
except FileNotFoundError:
print(
"Config file not found: {}".format(Path(args.config).expanduser()),
file=sys.stderr,
)
print(
"Use --config PATH or create pygea.toml in the project root",
file=sys.stderr,
)
return 2
try:
manifest_feeds: list[dict[str, str]] = []
pangea_feed_class = feed_class()
for feed in config.feeds:
pf = pangea_feed_class(config, [feed])
pf.acquire_content()
pf.generate_feed()
# put each feed into a different sub-directory
feed_subdir = hashlib.md5(cat_tuple[0].encode("utf-8")).hexdigest()[:7]
pf.disgorge(feed_subdir)
manifest_categories.append(
{
"name": cat_tuple[0],
"short-hash": feed_subdir,
"local-path": os.path.join(feed_subdir, OUTPUT_FILE_NAME).replace(
os.sep, "/"
),
}
)
print(
"feed for {} output to sub-directory {}".format(
cat_tuple[0], feed_subdir
output_path = pf.disgorge(feed["slug"])
if output_path is not None:
manifest_feeds.append(_manifest_entry(feed, output_path))
print(
"feed for {} output to sub-directory {}".format(
feed["name"], feed["slug"]
)
)
)
write_manifest(manifest_categories)
write_manifest(config, manifest_feeds)
except PangeaServiceException as error:
print(error)
print(error, file=sys.stderr)
return 1
return 0
def _manifest_entry(feed: FeedDefinition, output_path: Path) -> dict[str, str]:
return {
"name": feed["name"],
"slug": feed["slug"],
"url": output_path.resolve().as_uri(),
}
if __name__ == "__main__":
main()
sys.exit(main())