This commit is contained in:
Abel Luck 2026-03-29 13:48:30 +02:00
parent 81bb8afc41
commit 98dcea4d7e
10 changed files with 811 additions and 478 deletions

View file

@ -1,16 +1,16 @@
"""Pygea main entry point"""
import hashlib
import json
import os
from pygea import utilities
from pygea.pangeafeed import PangeaFeed
from pygea.pexception import PangeaServiceException
from pygea import utilities
# Output settings looked up through utilities.get_configuration_variable.
# NOTE(review): the first argument ('results') is presumably the configuration
# section and the second the key within it -- confirm against utilities.
# NOTE(review): the '_p' suffix suggests output_to_file_p is a boolean flag -- confirm.
OUTPUT_TO_FILE = utilities.get_configuration_variable('results', 'output_to_file_p')
OUTPUT_FILE_NAME = utilities.get_configuration_variable('results', 'output_file_name')
OUTPUT_DIRECTORY = utilities.get_configuration_variable('results', 'output_directory')
# The same three lookups again with double-quoted arguments; the argument
# values are identical to the ones above.
OUTPUT_TO_FILE = utilities.get_configuration_variable("results", "output_to_file_p")
OUTPUT_FILE_NAME = utilities.get_configuration_variable("results", "output_file_name")
OUTPUT_DIRECTORY = utilities.get_configuration_variable("results", "output_directory")
def write_manifest(categories):
@ -22,61 +22,71 @@ def write_manifest(categories):
if not os.path.exists(output_directory):
os.makedirs(output_directory)
manifest_path = os.path.join(output_directory, 'manifest.json')
with open(manifest_path, 'w', encoding='utf-8') as mfile:
json.dump({'categories': categories}, mfile, indent=2, ensure_ascii=False)
mfile.write('\n')
manifest_path = os.path.join(output_directory, "manifest.json")
with open(manifest_path, "w", encoding="utf-8") as mfile:
json.dump({"categories": categories}, mfile, indent=2, ensure_ascii=False)
mfile.write("\n")
def main():
    """Generate one feed per configured category, then write the manifest.

    For every category tuple in ``args["categories"]`` a ``PangeaFeed`` is
    created for the fixed domain, its content is acquired, the feed is
    generated, and ``disgorge`` is called with a sub-directory name made of
    the first seven hex digits of the MD5 digest of the UTF-8 encoded
    category name. One manifest entry (name, short hash, ``/``-separated
    relative path) is collected per category and the full list is handed to
    ``write_manifest`` once the loop has finished.

    A ``PangeaServiceException`` raised inside the ``try`` block is printed
    and ends the run; in that case ``write_manifest`` is not called.
    """
    # Feeds are generated for a single, specified, domain
    domain = "www.martinoticias.com"

    args = {
        # tuple values:
        #  [0] category name or a string representing a content query
        #  [1] only the newest content desired (as configured in pygea.ini)?
        #  [2] special content_type for this category only (from the approved list of types)
        "categories": [
            ("Titulares", True, None),
            ("Cuba", True, None),
            ("América Latina", True, None),
            # YES! this category name has a space character at the end!
            ("Info Martí ", False, None),
            ("Noticiero Martí Noticias", True, None),
        ],
        "default_content_type": "articles",
    }

    # TWO OPTIONS from the args defined above:
    # 1. Generate a single feed from the defined categories
    # try:
    #     pf = PangeaFeed(domain, args)
    #     pf.acquire_content()
    #     pf.generate_feed()
    #     pf.disgorge()
    # except PangeaServiceException as error:
    #     print(error)

    # 2. Generate different feeds for each defined category
    try:
        manifest_categories = []
        for cat_tuple in args["categories"]:
            category_name = cat_tuple[0]

            # Form new args for each category/query. The content type is taken
            # from ``args`` so the value is defined in exactly one place.
            newargs = {
                "categories": [cat_tuple],
                "default_content_type": args["default_content_type"],
            }
            pf = PangeaFeed(domain, newargs)
            pf.acquire_content()
            pf.generate_feed()

            # Put each feed into a different sub-directory. MD5 is used only
            # to derive a short, stable directory name from the category name.
            name_digest = hashlib.md5(category_name.encode("utf-8")).hexdigest()
            feed_subdir = name_digest[:7]
            pf.disgorge(feed_subdir)

            # The manifest always stores forward slashes, whatever os.sep is.
            local_path = os.path.join(feed_subdir, OUTPUT_FILE_NAME).replace(
                os.sep, "/"
            )
            manifest_categories.append(
                {
                    "name": category_name,
                    "short-hash": feed_subdir,
                    "local-path": local_path,
                }
            )
            print(f"feed for {category_name} output to sub-directory {feed_subdir}")

        write_manifest(manifest_categories)
    except PangeaServiceException as error:
        print(error)