"""SQLite persistence layer: database handle, schema bootstrap and peewee models."""

from __future__ import annotations

import os
from datetime import UTC, datetime
from enum import IntEnum
from fnmatch import fnmatchcase
from importlib import resources
from importlib.resources.abc import Traversable
from pathlib import Path

from peewee import (
    BooleanField,
    Check,
    DateTimeField,
    ForeignKeyField,
    IntegerField,
    Model,
    SqliteDatabase,
    TextField,
)

# Database file used when neither an explicit path nor REPUBLISHER_DB_PATH is given.
DEFAULT_DB_PATH = Path("republisher.db")

# PRAGMA settings handed to peewee's SqliteDatabase.
# NOTE(review): 15625 pages * 4096-byte pages looks like a ~64 MB page cache --
# confirm that is the intent.
DATABASE_PRAGMAS = {
    "busy_timeout": 5000,
    "cache_size": 15625,
    "foreign_keys": 1,
    "journal_mode": "wal",
    "page_size": 4096,
    "synchronous": "normal",
    "temp_store": "memory",
}

# File-name pattern selecting the packaged schema scripts (see schema_paths).
SCHEMA_GLOB = "*.sql"

# Deferred handle: the real path is bound later by initialize_database().
database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)


class JobExecutionStatus(IntEnum):
    """Integer status codes stored in ``JobExecution.running_status``.

    The column carries a ``BETWEEN 0 AND 4`` CHECK constraint, so any member
    added here needs that constraint widened as well.
    """

    PENDING = 0
    RUNNING = 1
    SUCCEEDED = 2
    FAILED = 3
    CANCELED = 4


def utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(UTC)


def resolve_database_path(db_path: str | Path | None = None) -> Path:
    """Return the absolute path of the database file.

    Args:
        db_path: Explicit location. When ``None``, the ``REPUBLISHER_DB_PATH``
            environment variable is consulted, falling back to
            ``DEFAULT_DB_PATH``.

    Returns:
        The chosen path with ``~`` expanded and resolved to an absolute path.
    """
    if db_path is None:
        # An empty REPUBLISHER_DB_PATH counts as unset: Path("") would
        # otherwise resolve to the current working directory, which is not a
        # usable database file.
        raw_value: str | Path = (
            os.environ.get("REPUBLISHER_DB_PATH") or DEFAULT_DB_PATH
        )
    else:
        raw_value = db_path
    return Path(raw_value).expanduser().resolve()


def schema_paths() -> tuple[Traversable, ...]:
    """Return the packaged schema scripts, ordered by file name.

    Scripts are the entries of the ``sql`` directory inside the ``repub``
    package whose names match ``SCHEMA_GLOB``.
    """
    schema_dir = resources.files("repub").joinpath("sql")
    # fnmatchcase keeps the match case-sensitive on every platform, exactly
    # like the plain ".sql" suffix test that SCHEMA_GLOB stands for.
    scripts = [
        entry
        for entry in schema_dir.iterdir()
        if fnmatchcase(entry.name, SCHEMA_GLOB)
    ]
    scripts.sort(key=lambda entry: entry.name)
    return tuple(scripts)


def initialize_database(db_path: str | Path | None = None) -> Path:
    """Bind the shared database handle to a file and apply the schema scripts.

    The parent directory is created if missing, any open connection on the
    shared handle is closed, and every script from ``schema_paths()`` is
    executed in order. The connection is closed again before returning.

    Args:
        db_path: Forwarded to ``resolve_database_path``.

    Returns:
        The resolved path the database handle is now bound to.
    """
    resolved_path = resolve_database_path(db_path)
    resolved_path.parent.mkdir(parents=True, exist_ok=True)

    # Drop any connection still bound to a previous path before re-binding.
    if not database.is_closed():
        database.close()

    database.init(str(resolved_path), pragmas=DATABASE_PRAGMAS)
    database.connect(reuse_if_open=True)
    try:
        connection = database.connection()
        # Scripts run on every call.
        # NOTE(review): they are presumably idempotent (IF NOT EXISTS) -- confirm.
        for path in schema_paths():
            connection.executescript(path.read_text(encoding="utf-8"))
    finally:
        database.close()

    return resolved_path


class BaseModel(Model):
    """Base class binding every model to the shared ``database`` handle."""

    class Meta:
        database = database


class Source(BaseModel):
    """A content source; ``source_type`` is limited to 'feed' or 'pangea'."""

    created_at = DateTimeField(default=utc_now)
    updated_at = DateTimeField(default=utc_now)
    name = TextField()
    slug = TextField(unique=True)
    source_type = TextField(constraints=[Check("source_type IN ('feed', 'pangea')")])
    notes = TextField(default="")

    class Meta:
        table_name = "source"


class SourceFeed(BaseModel):
    """Feed settings for a ``Source``; the foreign key is also the primary key."""

    source = ForeignKeyField(Source, primary_key=True, backref="feed_config")
    feed_url = TextField()
    etag = TextField(null=True)
    last_modified = TextField(null=True)

    class Meta:
        table_name = "source_feed"


class SourcePangea(BaseModel):
    """Pangea settings for a ``Source``; the foreign key is also the primary key."""

    source = ForeignKeyField(Source, primary_key=True, backref="pangea_config")
    domain = TextField()
    category_name = TextField()
    content_type = TextField()
    only_newest = BooleanField()
    max_articles = IntegerField()
    oldest_article = IntegerField()
    include_authors = BooleanField()
    exclude_media = BooleanField()
    include_content = BooleanField()
    content_format = TextField()

    class Meta:
        table_name = "source_pangea"


class Job(BaseModel):
    """A job attached to a ``Source``; the unique key allows one job per source.

    The five ``cron_*`` text columns hold the schedule.
    NOTE(review): presumably standard cron field expressions -- confirm.
    """

    source = ForeignKeyField(Source, unique=True, backref="job")
    created_at = DateTimeField(default=utc_now)
    updated_at = DateTimeField(default=utc_now)
    enabled = BooleanField()
    spider_arguments = TextField(default="")
    cron_minute = TextField()
    cron_hour = TextField()
    cron_day_of_month = TextField()
    cron_day_of_week = TextField()
    cron_month = TextField()

    class Meta:
        table_name = "job"


class JobExecution(BaseModel):
    """One run of a ``Job`` with its status, timestamps and counters."""

    job = ForeignKeyField(Job, backref="executions")
    created_at = DateTimeField(default=utc_now)
    started_at = DateTimeField(null=True)
    ended_at = DateTimeField(null=True)
    # Holds a JobExecutionStatus value; the CHECK range mirrors that enum.
    running_status = IntegerField(
        default=JobExecutionStatus.PENDING,
        constraints=[Check("running_status BETWEEN 0 AND 4")],
    )
    requests_count = IntegerField(default=0)
    items_count = IntegerField(default=0)
    warnings_count = IntegerField(default=0)
    errors_count = IntegerField(default=0)
    bytes_count = IntegerField(default=0)
    retries_count = IntegerField(default=0)
    exceptions_count = IntegerField(default=0)
    cache_size_count = IntegerField(default=0)
    cache_object_count = IntegerField(default=0)
    # NOTE(review): the "{}" default suggests a JSON-encoded object -- confirm.
    raw_stats = TextField(default="{}")

    class Meta:
        table_name = "job_execution"