from __future__ import annotations

import os
from datetime import UTC, datetime
from enum import IntEnum
from fnmatch import fnmatchcase
from importlib import resources
from importlib.resources.abc import Traversable
from pathlib import Path

from peewee import (
    BooleanField,
    Check,
    DateTimeField,
    ForeignKeyField,
    IntegerField,
    Model,
    SqliteDatabase,
    TextField,
)


DEFAULT_DB_PATH = Path("republisher.db")
|
|
DATABASE_PRAGMAS = {
|
|
"busy_timeout": 5000,
|
|
"cache_size": 15625,
|
|
"foreign_keys": 1,
|
|
"journal_mode": "wal",
|
|
"page_size": 4096,
|
|
"synchronous": "normal",
|
|
"temp_store": "memory",
|
|
}
|
|
SCHEMA_GLOB = "*.sql"
|
|
|
|
database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)
|
|
|
|
|
|
class JobExecutionStatus(IntEnum):
|
|
PENDING = 0
|
|
RUNNING = 1
|
|
SUCCEEDED = 2
|
|
FAILED = 3
|
|
CANCELED = 4
|
|
|
|
|
|
def utc_now() -> datetime:
|
|
return datetime.now(UTC)
|
|
|
|
|
|
def resolve_database_path(db_path: str | Path | None = None) -> Path:
|
|
raw_value = (
|
|
os.environ.get("REPUBLISHER_DB_PATH", DEFAULT_DB_PATH)
|
|
if db_path is None
|
|
else db_path
|
|
)
|
|
raw_path = Path(raw_value)
|
|
return raw_path.expanduser().resolve()
|
|
|
|
|
|
def schema_paths() -> tuple[Traversable, ...]:
    """Return the packaged SQL schema scripts, sorted by file name.

    Matches entries of the package's ``sql`` directory against the
    module-level ``SCHEMA_GLOB`` pattern (previously the ``.sql`` suffix was
    hard-coded here, leaving the constant dead), so the pattern is defined in
    exactly one place.  ``fnmatchcase`` keeps the match case-sensitive,
    identical to the original suffix test.  Name ordering gives
    ``initialize_database()`` a deterministic script execution order.
    """
    schema_dir = resources.files("repub").joinpath("sql")
    matches = (
        entry for entry in schema_dir.iterdir() if fnmatchcase(entry.name, SCHEMA_GLOB)
    )
    return tuple(sorted(matches, key=lambda entry: entry.name))


def initialize_database(db_path: str | Path | None = None) -> Path:
    """Bind the module-level database to a file and run all schema scripts.

    The parent directory is created if needed, every packaged schema script is
    executed via ``executescript``, and the connection is closed afterwards.
    Returns the resolved, absolute database path.
    """
    target = resolve_database_path(db_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Re-point the deferred-init handle at the resolved file; drop any
    # connection left over from a previous initialization first.
    if not database.is_closed():
        database.close()
    database.init(str(target), pragmas=DATABASE_PRAGMAS)

    database.connect(reuse_if_open=True)
    try:
        raw_connection = database.connection()
        for script in schema_paths():
            raw_connection.executescript(script.read_text(encoding="utf-8"))
    finally:
        database.close()

    return target


def source_slug_exists(slug: str) -> bool:
    """Return True when a ``Source`` row with *slug* already exists."""
    query = Source.select().where(Source.slug == slug)
    with database.connection_context():
        return query.exists()


def create_source(
    *,
    name: str,
    slug: str,
    source_type: str,
    notes: str,
    spider_arguments: str,
    enabled: bool,
    cron_minute: str,
    cron_hour: str,
    cron_day_of_month: str,
    cron_day_of_week: str,
    cron_month: str,
    feed_url: str = "",
    pangea_domain: str = "",
    pangea_category: str = "",
    content_type: str = "",
    only_newest: bool = True,
    max_articles: int | None = None,
    oldest_article: int | None = None,
    include_authors: bool = True,
    exclude_media: bool = False,
    include_content: bool = True,
    content_format: str = "",
) -> Source:
    """Persist a new source, its type-specific config row, and its job row.

    All three inserts happen inside a single transaction; the new ``Source``
    is returned.  A ``source_type`` of ``"feed"`` gets a ``SourceFeed`` row,
    anything else a ``SourcePangea`` row.
    """
    with database.connection_context(), database.atomic():
        source = Source.create(
            name=name, slug=slug, source_type=source_type, notes=notes
        )

        if source_type == "feed":
            SourceFeed.create(source=source, feed_url=feed_url)
        else:
            SourcePangea.create(
                source=source,
                domain=pangea_domain,
                category_name=pangea_category,
                content_type=content_type,
                only_newest=only_newest,
                max_articles=max_articles,
                oldest_article=oldest_article,
                include_authors=include_authors,
                exclude_media=exclude_media,
                include_content=include_content,
                content_format=content_format,
            )

        Job.create(
            source=source,
            enabled=enabled,
            spider_arguments=spider_arguments,
            cron_minute=cron_minute,
            cron_hour=cron_hour,
            cron_day_of_month=cron_day_of_month,
            cron_day_of_week=cron_day_of_week,
            cron_month=cron_month,
        )
        return source


def load_sources() -> tuple[dict[str, object], ...]:
    """Load every source (newest first) projected into display dictionaries.

    Fetches the related job, feed, and pangea rows in one query each, then
    joins them in memory by source id via ``_project_source``.
    """
    with database.connection_context():
        sources = tuple(Source.select().order_by(Source.created_at.desc()))
        ids = tuple(int(src.get_id()) for src in sources)
        if not ids:
            return ()

        jobs_by_id = {
            row.source_id: row for row in Job.select().where(Job.source.in_(ids))
        }
        feeds_by_id = {
            row.source_id: row
            for row in SourceFeed.select().where(SourceFeed.source.in_(ids))
        }
        pangea_by_id = {
            row.source_id: row
            for row in SourcePangea.select().where(SourcePangea.source.in_(ids))
        }

        return tuple(
            _project_source(src, jobs_by_id, feeds_by_id, pangea_by_id)
            for src in sources
        )


def _project_source(
|
|
source: "Source",
|
|
jobs: dict[int, "Job"],
|
|
feed_configs: dict[int, "SourceFeed"],
|
|
pangea_configs: dict[int, "SourcePangea"],
|
|
) -> dict[str, object]:
|
|
source_id = int(source.get_id())
|
|
job = jobs[source_id]
|
|
if source.source_type == "feed":
|
|
upstream = feed_configs[source_id].feed_url
|
|
source_type = "Feed"
|
|
else:
|
|
pangea = pangea_configs[source_id]
|
|
upstream = f"{pangea.domain} / {pangea.category_name}"
|
|
source_type = "Pangea"
|
|
|
|
return {
|
|
"name": source.name,
|
|
"slug": source.slug,
|
|
"source_type": source_type,
|
|
"upstream": upstream,
|
|
"schedule": (
|
|
f"cron: {job.cron_minute} {job.cron_hour} {job.cron_day_of_month} "
|
|
f"{job.cron_month} {job.cron_day_of_week}"
|
|
),
|
|
"last_run": "Never run",
|
|
"state": "Enabled" if job.enabled else "Disabled",
|
|
"state_tone": "scheduled" if job.enabled else "idle",
|
|
}
|
|
|
|
|
|
class BaseModel(Model):
|
|
class Meta:
|
|
database = database
|
|
|
|
|
|
class Source(BaseModel):
|
|
created_at = DateTimeField(default=utc_now)
|
|
updated_at = DateTimeField(default=utc_now)
|
|
name = TextField()
|
|
slug = TextField(unique=True)
|
|
source_type = TextField(constraints=[Check("source_type IN ('feed', 'pangea')")])
|
|
notes = TextField(default="")
|
|
|
|
class Meta:
|
|
table_name = "source"
|
|
|
|
|
|
class SourceFeed(BaseModel):
|
|
source = ForeignKeyField(Source, primary_key=True, backref="feed_config")
|
|
feed_url = TextField()
|
|
etag = TextField(null=True)
|
|
last_modified = TextField(null=True)
|
|
|
|
class Meta:
|
|
table_name = "source_feed"
|
|
|
|
|
|
class SourcePangea(BaseModel):
|
|
source = ForeignKeyField(Source, primary_key=True, backref="pangea_config")
|
|
domain = TextField()
|
|
category_name = TextField()
|
|
content_type = TextField()
|
|
only_newest = BooleanField()
|
|
max_articles = IntegerField()
|
|
oldest_article = IntegerField()
|
|
include_authors = BooleanField()
|
|
exclude_media = BooleanField()
|
|
include_content = BooleanField()
|
|
content_format = TextField()
|
|
|
|
class Meta:
|
|
table_name = "source_pangea"
|
|
|
|
|
|
class Job(BaseModel):
|
|
source = ForeignKeyField(Source, unique=True, backref="job")
|
|
created_at = DateTimeField(default=utc_now)
|
|
updated_at = DateTimeField(default=utc_now)
|
|
enabled = BooleanField()
|
|
spider_arguments = TextField(default="")
|
|
cron_minute = TextField()
|
|
cron_hour = TextField()
|
|
cron_day_of_month = TextField()
|
|
cron_day_of_week = TextField()
|
|
cron_month = TextField()
|
|
|
|
class Meta:
|
|
table_name = "job"
|
|
|
|
|
|
class JobExecution(BaseModel):
|
|
job = ForeignKeyField(Job, backref="executions")
|
|
created_at = DateTimeField(default=utc_now)
|
|
started_at = DateTimeField(null=True)
|
|
ended_at = DateTimeField(null=True)
|
|
running_status = IntegerField(
|
|
default=JobExecutionStatus.PENDING,
|
|
constraints=[Check("running_status BETWEEN 0 AND 4")],
|
|
)
|
|
requests_count = IntegerField(default=0)
|
|
items_count = IntegerField(default=0)
|
|
warnings_count = IntegerField(default=0)
|
|
errors_count = IntegerField(default=0)
|
|
bytes_count = IntegerField(default=0)
|
|
retries_count = IntegerField(default=0)
|
|
exceptions_count = IntegerField(default=0)
|
|
cache_size_count = IntegerField(default=0)
|
|
cache_object_count = IntegerField(default=0)
|
|
raw_stats = TextField(default="{}")
|
|
|
|
class Meta:
|
|
table_name = "job_execution"
|