add sqlite database

This commit is contained in:
Abel Luck 2026-03-30 13:26:59 +02:00
parent 06066c2394
commit b9e288a22d
5 changed files with 440 additions and 0 deletions

168
repub/model.py Normal file
View file

@ -0,0 +1,168 @@
from __future__ import annotations
import os
from datetime import UTC, datetime
from enum import IntEnum
from importlib import resources
from importlib.resources.abc import Traversable
from pathlib import Path
from peewee import (
BooleanField,
Check,
DateTimeField,
ForeignKeyField,
IntegerField,
Model,
SqliteDatabase,
TextField,
)
# Database file used when no explicit path is given and the
# REPUBLISHER_DB_PATH environment variable does not supply one.
DEFAULT_DB_PATH = Path("republisher.db")

# PRAGMA settings handed to peewee for the shared database handle.
# The key order is kept exactly as declared.
DATABASE_PRAGMAS = {
    "busy_timeout": 5000,  # presumably milliseconds (SQLite convention)
    # 15625 pages x the 4096-byte page_size below = 64,000,000 bytes,
    # assuming SQLite's "positive cache_size means pages" convention.
    "cache_size": 15625,
    "foreign_keys": 1,  # the schema scripts declare ON DELETE CASCADE keys
    "journal_mode": "wal",
    "page_size": 4096,
    "synchronous": "normal",
    "temp_store": "memory",
}

# Glob describing the bundled schema scripts.
SCHEMA_GLOB = "*.sql"

# Shared database handle, created without a file path.
# initialize_database() supplies the real path through database.init().
database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)
class JobExecutionStatus(IntEnum):
    """Integer codes stored in ``job_execution.running_status``.

    The values span 0 through 4, matching the ``BETWEEN 0 AND 4`` CHECK
    constraint declared on that column.
    """

    PENDING = 0  # default status for a newly created execution row
    RUNNING = 1
    SUCCEEDED = 2
    FAILED = 3
    CANCELED = 4
def utc_now() -> datetime:
return datetime.now(UTC)
def resolve_database_path(db_path: str | Path | None = None) -> Path:
raw_value = (
os.environ.get("REPUBLISHER_DB_PATH", DEFAULT_DB_PATH)
if db_path is None
else db_path
)
raw_path = Path(raw_value)
return raw_path.expanduser().resolve()
def schema_paths() -> tuple[Traversable, ...]:
    """Return the bundled SQL schema scripts, ordered by file name.

    The scripts are looked up in the ``sql`` resource directory of the
    ``repub`` package. Only regular files whose name matches ``SCHEMA_GLOB``
    are returned.

    Returns:
        A tuple of traversable resources sorted by ``name``, so numbered
        scripts such as ``001_initial.sql`` come out in ascending order.
    """
    # Local import: this is the only place in the module that needs it.
    from fnmatch import fnmatchcase

    sql_dir = resources.files("repub").joinpath("sql")
    scripts = [
        entry
        for entry in sql_dir.iterdir()
        # Skip directories and match against the shared SCHEMA_GLOB constant
        # rather than a second hard-coded ".sql" suffix. fnmatchcase keeps
        # the comparison case-sensitive on every platform.
        if entry.is_file() and fnmatchcase(entry.name, SCHEMA_GLOB)
    ]
    scripts.sort(key=lambda entry: entry.name)
    return tuple(scripts)
def initialize_database(db_path: str | Path | None = None) -> Path:
    """Point the shared database at ``db_path`` and run every schema script.

    Args:
        db_path: Optional database location; it is passed through
            ``resolve_database_path()``.

    Returns:
        The resolved database path.
    """
    target = resolve_database_path(db_path)
    # Make sure the containing directory exists before connecting.
    target.parent.mkdir(parents=True, exist_ok=True)

    # Close any open connection before re-pointing the shared handle.
    if not database.is_closed():
        database.close()
    database.init(str(target), pragmas=DATABASE_PRAGMAS)

    database.connect(reuse_if_open=True)
    try:
        raw_connection = database.connection()
        for script in schema_paths():
            sql_text = script.read_text(encoding="utf-8")
            raw_connection.executescript(sql_text)
    finally:
        # The connection is closed whether or not a script failed.
        database.close()
    return target
class BaseModel(Model):
    """Base class that binds its subclasses to the module-level database."""

    class Meta:
        # Shared handle; its file path is supplied by initialize_database().
        database = database
class Source(BaseModel):
    """Row of the ``source`` table.

    ``slug`` is unique, and ``source_type`` is limited to ``'feed'`` or
    ``'pangea'`` by a CHECK constraint.
    """

    created_at = DateTimeField(default=utc_now)
    updated_at = DateTimeField(default=utc_now)
    name = TextField()
    slug = TextField(unique=True)
    source_type = TextField(
        constraints=[Check("source_type IN ('feed', 'pangea')")],
    )
    notes = TextField(default="")

    class Meta:
        table_name = "source"
class SourceFeed(BaseModel):
    """Feed settings for a source, stored in the ``source_feed`` table.

    The foreign key to ``Source`` is also the primary key, so each source
    has at most one row here. It is reachable from a ``Source`` through the
    ``feed_config`` backref.
    """

    # on_delete mirrors the ON DELETE CASCADE declared for source_feed in the
    # SQL schema script, so the model and the schema describe the same key.
    source = ForeignKeyField(
        Source,
        primary_key=True,
        backref="feed_config",
        on_delete="CASCADE",
    )
    feed_url = TextField()
    etag = TextField(null=True)
    last_modified = TextField(null=True)

    class Meta:
        table_name = "source_feed"
class SourcePangea(BaseModel):
    """Pangea settings for a source, stored in the ``source_pangea`` table.

    The foreign key to ``Source`` is also the primary key, so each source
    has at most one row here. It is reachable from a ``Source`` through the
    ``pangea_config`` backref. Every column is required; none has a default.
    """

    # on_delete mirrors the ON DELETE CASCADE declared for source_pangea in
    # the SQL schema script, so the model and the schema describe the same key.
    source = ForeignKeyField(
        Source,
        primary_key=True,
        backref="pangea_config",
        on_delete="CASCADE",
    )
    domain = TextField()
    category_name = TextField()
    content_type = TextField()
    only_newest = BooleanField()
    max_articles = IntegerField()
    oldest_article = IntegerField()
    include_authors = BooleanField()
    exclude_media = BooleanField()
    include_content = BooleanField()
    content_format = TextField()

    class Meta:
        table_name = "source_pangea"
class Job(BaseModel):
    """Scheduled job for a source, stored in the ``job`` table.

    ``source`` is unique, so a source has at most one job. The schedule is
    held as five text columns, one per cron field.
    """

    # on_delete mirrors the ON DELETE CASCADE declared for job.source_id in
    # the SQL schema script, so the model and the schema describe the same key.
    source = ForeignKeyField(
        Source,
        unique=True,
        backref="job",
        on_delete="CASCADE",
    )
    created_at = DateTimeField(default=utc_now)
    updated_at = DateTimeField(default=utc_now)
    enabled = BooleanField()
    spider_arguments = TextField(default="")
    cron_minute = TextField()
    cron_hour = TextField()
    cron_day_of_month = TextField()
    cron_day_of_week = TextField()
    cron_month = TextField()

    class Meta:
        table_name = "job"
class JobExecution(BaseModel):
    """One run of a ``Job``, stored in the ``job_execution`` table.

    ``running_status`` holds a ``JobExecutionStatus`` value and defaults to
    ``PENDING``. All ``*_count`` columns default to 0, and ``raw_stats``
    defaults to the text ``"{}"``.
    """

    # on_delete mirrors the ON DELETE CASCADE declared for
    # job_execution.job_id in the SQL schema script.
    job = ForeignKeyField(Job, backref="executions", on_delete="CASCADE")
    created_at = DateTimeField(default=utc_now)
    started_at = DateTimeField(null=True)
    ended_at = DateTimeField(null=True)
    running_status = IntegerField(
        default=JobExecutionStatus.PENDING,
        # Same range as the members of JobExecutionStatus (0 through 4).
        constraints=[Check("running_status BETWEEN 0 AND 4")],
    )
    requests_count = IntegerField(default=0)
    items_count = IntegerField(default=0)
    warnings_count = IntegerField(default=0)
    errors_count = IntegerField(default=0)
    bytes_count = IntegerField(default=0)
    retries_count = IntegerField(default=0)
    exceptions_count = IntegerField(default=0)
    cache_size_count = IntegerField(default=0)
    cache_object_count = IntegerField(default=0)
    raw_stats = TextField(default="{}")

    class Meta:
        table_name = "job_execution"

97
repub/sql/001_initial.sql Normal file
View file

@ -0,0 +1,97 @@
-- Sources: one row per source, identified by a unique slug.
-- source_type is limited to 'feed' or 'pangea'.
CREATE TABLE IF NOT EXISTS source (
    id          INTEGER PRIMARY KEY,
    created_at  TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at  TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    name        TEXT NOT NULL,
    slug        TEXT NOT NULL UNIQUE,
    source_type TEXT NOT NULL CHECK (source_type IN ('feed', 'pangea')),
    notes       TEXT NOT NULL DEFAULT ''
);
-- Feed settings, keyed by the owning source (at most one row per source).
-- Rows are removed together with their source.
CREATE TABLE IF NOT EXISTS source_feed (
    source_id     INTEGER PRIMARY KEY,
    feed_url      TEXT NOT NULL,
    etag          TEXT,
    last_modified TEXT,
    FOREIGN KEY (source_id) REFERENCES source(id) ON DELETE CASCADE
);
-- Pangea settings, keyed by the owning source (at most one row per source).
-- Flag columns hold 0 or 1. Rows are removed together with their source.
CREATE TABLE IF NOT EXISTS source_pangea (
    source_id       INTEGER PRIMARY KEY,
    domain          TEXT NOT NULL,
    category_name   TEXT NOT NULL,
    content_type    TEXT NOT NULL,
    only_newest     INTEGER NOT NULL CHECK (only_newest IN (0, 1)),
    max_articles    INTEGER NOT NULL,
    oldest_article  INTEGER NOT NULL,
    include_authors INTEGER NOT NULL CHECK (include_authors IN (0, 1)),
    exclude_media   INTEGER NOT NULL CHECK (exclude_media IN (0, 1)),
    include_content INTEGER NOT NULL CHECK (include_content IN (0, 1)),
    content_format  TEXT NOT NULL,
    FOREIGN KEY (source_id) REFERENCES source(id) ON DELETE CASCADE
);
-- Jobs: at most one per source (source_id is UNIQUE).
-- The schedule is held as five cron-field text columns.
-- Rows are removed together with their source.
CREATE TABLE IF NOT EXISTS job (
    id                INTEGER PRIMARY KEY,
    source_id         INTEGER NOT NULL UNIQUE,
    created_at        TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at        TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    enabled           INTEGER NOT NULL CHECK (enabled IN (0, 1)),
    spider_arguments  TEXT NOT NULL DEFAULT '',
    cron_minute       TEXT NOT NULL,
    cron_hour         TEXT NOT NULL,
    cron_day_of_month TEXT NOT NULL,
    cron_day_of_week  TEXT NOT NULL,
    cron_month        TEXT NOT NULL,
    FOREIGN KEY (source_id) REFERENCES source(id) ON DELETE CASCADE
);
-- Job executions: one row per run of a job.
-- running_status is an integer code from 0 to 4 and defaults to 0.
-- Every *_count column defaults to 0. Rows are removed together with their job.
CREATE TABLE IF NOT EXISTS job_execution (
    id                 INTEGER PRIMARY KEY,
    job_id             INTEGER NOT NULL,
    created_at         TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    started_at         TEXT,
    ended_at           TEXT,
    running_status     INTEGER NOT NULL DEFAULT 0 CHECK (running_status BETWEEN 0 AND 4),
    requests_count     INTEGER NOT NULL DEFAULT 0,
    items_count        INTEGER NOT NULL DEFAULT 0,
    warnings_count     INTEGER NOT NULL DEFAULT 0,
    errors_count       INTEGER NOT NULL DEFAULT 0,
    bytes_count        INTEGER NOT NULL DEFAULT 0,
    retries_count      INTEGER NOT NULL DEFAULT 0,
    exceptions_count   INTEGER NOT NULL DEFAULT 0,
    cache_size_count   INTEGER NOT NULL DEFAULT 0,
    cache_object_count INTEGER NOT NULL DEFAULT 0,
    raw_stats          TEXT NOT NULL DEFAULT '{}',
    FOREIGN KEY (job_id) REFERENCES job(id) ON DELETE CASCADE
);
-- Index on the job.enabled flag.
CREATE INDEX IF NOT EXISTS job_enabled_idx
    ON job (enabled);

-- Executions of one job, newest created_at first.
CREATE INDEX IF NOT EXISTS job_execution_job_created_at_idx
    ON job_execution (job_id, created_at DESC);

-- Executions by status, newest started_at first.
CREATE INDEX IF NOT EXISTS job_execution_status_started_at_idx
    ON job_execution (running_status, started_at DESC);

-- Executions by status, newest ended_at first.
CREATE INDEX IF NOT EXISTS job_execution_status_ended_at_idx
    ON job_execution (running_status, ended_at DESC);
-- After any UPDATE of a source row, stamp updated_at with the current time.
-- NOTE(review): the body updates the same table, so it would re-fire itself
-- if recursive triggers were enabled -- confirm that setting stays off.
CREATE TRIGGER IF NOT EXISTS source_set_updated_at
    AFTER UPDATE ON source
    FOR EACH ROW
BEGIN
    UPDATE source
       SET updated_at = CURRENT_TIMESTAMP
     WHERE id = NEW.id;
END;
-- After any UPDATE of a job row, stamp updated_at with the current time.
-- NOTE(review): the body updates the same table, so it would re-fire itself
-- if recursive triggers were enabled -- confirm that setting stays off.
CREATE TRIGGER IF NOT EXISTS job_set_updated_at
    AFTER UPDATE ON job
    FOR EACH ROW
BEGIN
    UPDATE job
       SET updated_at = CURRENT_TIMESTAMP
     WHERE id = NEW.id;
END;