switch to TOML config and export republisher feed manifests
This commit is contained in:
parent
98dcea4d7e
commit
897af2872c
17 changed files with 832 additions and 324 deletions
159
pygea/main.py
159
pygea/main.py
|
|
@ -1,96 +1,107 @@
|
|||
"""Pygea main entry point"""
|
||||
"""Pygea main entry point."""
|
||||
|
||||
import hashlib
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from pygea import utilities
|
||||
from pygea.pangeafeed import PangeaFeed
|
||||
from pygea.config import FeedDefinition, PygeaConfig, load_config
|
||||
from pygea.pexception import PangeaServiceException
|
||||
|
||||
OUTPUT_TO_FILE = utilities.get_configuration_variable("results", "output_to_file_p")
|
||||
OUTPUT_FILE_NAME = utilities.get_configuration_variable("results", "output_file_name")
|
||||
OUTPUT_DIRECTORY = utilities.get_configuration_variable("results", "output_directory")
|
||||
|
||||
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the feed generator.

    Args:
        argv: Argument strings to parse. ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        The parsed namespace; ``config`` holds the path given with
        ``-c``/``--config`` and defaults to ``"pygea.toml"``.
    """
    cli = argparse.ArgumentParser(description="Generate RSS feeds from Pangea")
    config_option = {
        "default": "pygea.toml",
        "help": "Path to runtime config TOML file",
    }
    cli.add_argument("-c", "--config", **config_option)
    return cli.parse_args(argv)
|
||||
|
||||
|
||||
def write_manifest(categories):
|
||||
"""Write the category manifest beside the generated feed output."""
|
||||
if OUTPUT_TO_FILE is not True:
|
||||
def _toml_string(value: str) -> str:
|
||||
return json.dumps(value, ensure_ascii=False)
|
||||
|
||||
|
||||
def render_manifest(feeds: list[dict[str, str]]) -> str:
    """Render the feed manifest as TOML text.

    Each entry becomes one ``[[feeds]]`` table with ``name``, ``slug`` and
    ``url`` keys, and consecutive tables are separated by a blank line.

    Args:
        feeds: Manifest entries; each must provide the ``name``, ``slug`` and
            ``url`` keys.

    Returns:
        The manifest text, always ending in exactly one newline. An empty
        *feeds* list yields a single newline.
    """
    tables: list[str] = []
    for entry in feeds:
        table_lines = (
            "[[feeds]]",
            f"name = {_toml_string(entry['name'])}",
            f"slug = {_toml_string(entry['slug'])}",
            f"url = {_toml_string(entry['url'])}",
        )
        tables.append("\n".join(table_lines))
    # A blank line between tables, none after the last one.
    body = "\n\n".join(tables)
    return body.rstrip() + "\n"
|
||||
|
||||
|
||||
def write_manifest(config: PygeaConfig, manifest_feeds: list[dict[str, str]]) -> None:
|
||||
"""Write the feed manifest beside the generated feed output."""
|
||||
if config.results.output_to_file_p is not True:
|
||||
return
|
||||
|
||||
output_directory = os.path.normpath(OUTPUT_DIRECTORY)
|
||||
if not os.path.exists(output_directory):
|
||||
os.makedirs(output_directory)
|
||||
|
||||
manifest_path = os.path.join(output_directory, "manifest.json")
|
||||
with open(manifest_path, "w", encoding="utf-8") as mfile:
|
||||
json.dump({"categories": categories}, mfile, indent=2, ensure_ascii=False)
|
||||
mfile.write("\n")
|
||||
config.results.output_directory.mkdir(parents=True, exist_ok=True)
|
||||
manifest_path = config.results.output_directory / "manifest.toml"
|
||||
manifest_path.write_text(render_manifest(manifest_feeds), encoding="utf-8")
|
||||
|
||||
|
||||
def main():
|
||||
# Feeds are generated for a single, specified, domain
|
||||
domain = "www.martinoticias.com"
|
||||
def feed_class():
|
||||
from pygea.pangeafeed import PangeaFeed
|
||||
|
||||
args = {
|
||||
# tuple values:
|
||||
# [0] category name or a string representing a content query
|
||||
# [1] only the newest content desired (as configured in pygea.ini)?
|
||||
# [2] special content_type for this category only (from the approved list of types)
|
||||
"categories": [
|
||||
("Titulares", True, None),
|
||||
("Cuba", True, None),
|
||||
("América Latina", True, None),
|
||||
(
|
||||
"Info Martí ",
|
||||
False,
|
||||
None,
|
||||
), # YES! this category name has a space character at the end!
|
||||
("Noticiero Martí Noticias", True, None),
|
||||
],
|
||||
"default_content_type": "articles",
|
||||
}
|
||||
return PangeaFeed
|
||||
|
||||
# TWO OPTIONS from the args defined above:
|
||||
# 1. Generate a single feed from the defined categories
|
||||
# try:
|
||||
# pf = PangeaFeed(domain, args)
|
||||
# pf.acquire_content()
|
||||
# pf.generate_feed()
|
||||
# pf.disgorge()
|
||||
# except PangeaServiceException as error:
|
||||
# print(error)
|
||||
|
||||
# 2. Generate different feeds for each defined category
|
||||
def main(argv: list[str] | None = None) -> int:
|
||||
args = parse_args(argv)
|
||||
try:
|
||||
manifest_categories = []
|
||||
for cat_tuple in args["categories"]:
|
||||
# form new args for each category/query
|
||||
newargs = {"categories": [cat_tuple], "default_content_type": "articles"}
|
||||
pf = PangeaFeed(domain, newargs)
|
||||
config = load_config(args.config)
|
||||
except FileNotFoundError:
|
||||
print(
|
||||
"Config file not found: {}".format(Path(args.config).expanduser()),
|
||||
file=sys.stderr,
|
||||
)
|
||||
print(
|
||||
"Use --config PATH or create pygea.toml in the project root",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 2
|
||||
|
||||
try:
|
||||
manifest_feeds: list[dict[str, str]] = []
|
||||
pangea_feed_class = feed_class()
|
||||
for feed in config.feeds:
|
||||
pf = pangea_feed_class(config, [feed])
|
||||
pf.acquire_content()
|
||||
pf.generate_feed()
|
||||
# put each feed into a different sub-directory
|
||||
feed_subdir = hashlib.md5(cat_tuple[0].encode("utf-8")).hexdigest()[:7]
|
||||
pf.disgorge(feed_subdir)
|
||||
manifest_categories.append(
|
||||
{
|
||||
"name": cat_tuple[0],
|
||||
"short-hash": feed_subdir,
|
||||
"local-path": os.path.join(feed_subdir, OUTPUT_FILE_NAME).replace(
|
||||
os.sep, "/"
|
||||
),
|
||||
}
|
||||
)
|
||||
print(
|
||||
"feed for {} output to sub-directory {}".format(
|
||||
cat_tuple[0], feed_subdir
|
||||
output_path = pf.disgorge(feed["slug"])
|
||||
if output_path is not None:
|
||||
manifest_feeds.append(_manifest_entry(feed, output_path))
|
||||
print(
|
||||
"feed for {} output to sub-directory {}".format(
|
||||
feed["name"], feed["slug"]
|
||||
)
|
||||
)
|
||||
)
|
||||
write_manifest(manifest_categories)
|
||||
write_manifest(config, manifest_feeds)
|
||||
except PangeaServiceException as error:
|
||||
print(error)
|
||||
print(error, file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def _manifest_entry(feed: FeedDefinition, output_path: Path) -> dict[str, str]:
|
||||
return {
|
||||
"name": feed["name"],
|
||||
"slug": feed["slug"],
|
||||
"url": output_path.resolve().as_uri(),
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
sys.exit(main())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue