# republisher/repub/model.py
from __future__ import annotations
import json
import os
from datetime import UTC, datetime
from enum import IntEnum
from importlib import resources
from importlib.resources.abc import Traversable
from pathlib import Path
from typing import Any
from peewee import (
BooleanField,
Check,
DateTimeField,
ForeignKeyField,
IntegerField,
Model,
SqliteDatabase,
TextField,
)
from playhouse.migrate import SchemaMigrator, migrate
DEFAULT_DB_PATH = Path("republisher.db")
DATABASE_PRAGMAS = {
"busy_timeout": 5000,
"cache_size": 15625,
"foreign_keys": 1,
"journal_mode": "wal",
"page_size": 4096,
"synchronous": "normal",
"temp_store": "memory",
}
SCHEMA_GLOB = "*.sql"
2026-03-30 18:26:02 +02:00
MAX_CONCURRENT_JOBS_SETTING_KEY = "max_concurrent_jobs"
DEFAULT_MAX_CONCURRENT_JOBS = 1
2026-03-31 12:14:47 +02:00
FEED_URL_SETTING_KEY = "feed_url"
DEFAULT_FEED_URL = ""
2026-03-30 13:26:59 +02:00
database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)
class JobExecutionStatus(IntEnum):
PENDING = 0
RUNNING = 1
SUCCEEDED = 2
FAILED = 3
CANCELED = 4
def utc_now() -> datetime:
return datetime.now(UTC)
def resolve_database_path(db_path: str | Path | None = None) -> Path:
raw_value = (
os.environ.get("REPUBLISHER_DB_PATH", DEFAULT_DB_PATH)
if db_path is None
else db_path
)
raw_path = Path(raw_value)
return raw_path.expanduser().resolve()
def schema_paths() -> tuple[Traversable, ...]:
schema_dir = resources.files("repub").joinpath("sql")
return tuple(
sorted(
(path for path in schema_dir.iterdir() if path.name.endswith(".sql")),
key=lambda path: path.name,
)
)
def initialize_database(db_path: str | Path | None = None) -> Path:
resolved_path = resolve_database_path(db_path)
resolved_path.parent.mkdir(parents=True, exist_ok=True)
if not database.is_closed():
database.close()
database.init(str(resolved_path), pragmas=DATABASE_PRAGMAS)
database.connect(reuse_if_open=True)
try:
for path in schema_paths():
database.connection().executescript(path.read_text(encoding="utf-8"))
_run_legacy_migrations()
2026-03-30 13:26:59 +02:00
finally:
database.close()
return resolved_path
def _run_legacy_migrations() -> None:
job_columns = {column.name for column in database.get_columns("job")}
operations = []
migrator = SchemaMigrator.from_database(database)
2026-03-30 18:26:02 +02:00
if "convert_images" not in job_columns:
operations.extend(
(
migrator.add_column(
"job",
"convert_images",
BooleanField(
default=True,
constraints=[Check("convert_images IN (0, 1)")],
),
),
migrator.add_column_default("job", "convert_images", 1),
)
2026-03-30 18:26:02 +02:00
)
if "convert_video" not in job_columns:
operations.extend(
(
migrator.add_column(
"job",
"convert_video",
BooleanField(
default=True,
constraints=[Check("convert_video IN (0, 1)")],
),
),
migrator.add_column_default("job", "convert_video", 1),
)
2026-03-30 18:26:02 +02:00
)
if operations:
with database.atomic():
migrate(*operations)
2026-03-30 18:26:02 +02:00
2026-03-30 13:37:25 +02:00
def source_slug_exists(slug: str) -> bool:
with database.connection_context():
return Source.select().where(Source.slug == slug).exists()
2026-03-30 18:26:02 +02:00
def save_setting(key: str, value: Any) -> None:
payload = json.dumps(value, sort_keys=True)
with database.connection_context():
with database.atomic():
setting = AppSetting.get_or_none(AppSetting.key == key)
if setting is None:
AppSetting.create(key=key, value=payload)
return
setting.value = payload
setting.save()
def load_setting(key: str, default: Any) -> Any:
with database.connection_context():
setting = AppSetting.get_or_none(AppSetting.key == key)
if setting is None:
return default
try:
return json.loads(setting.value)
except json.JSONDecodeError:
return default
def load_max_concurrent_jobs() -> int:
value = load_setting(MAX_CONCURRENT_JOBS_SETTING_KEY, DEFAULT_MAX_CONCURRENT_JOBS)
try:
parsed = int(value)
except (TypeError, ValueError):
return DEFAULT_MAX_CONCURRENT_JOBS
return parsed if parsed >= 1 else DEFAULT_MAX_CONCURRENT_JOBS
2026-03-31 12:14:47 +02:00
def load_feed_url() -> str:
value = load_setting(FEED_URL_SETTING_KEY, DEFAULT_FEED_URL)
return value if isinstance(value, str) else DEFAULT_FEED_URL
2026-03-30 18:26:02 +02:00
def load_settings_form() -> dict[str, object]:
2026-03-31 12:14:47 +02:00
return {
"max_concurrent_jobs": load_max_concurrent_jobs(),
"feed_url": load_feed_url(),
}
2026-03-30 18:26:02 +02:00
2026-03-30 13:49:00 +02:00
def load_source_form(slug: str) -> dict[str, object] | None:
with database.connection_context():
source = Source.get_or_none(Source.slug == slug)
if source is None:
return None
job = Job.get(Job.source == source)
form_data: dict[str, object] = {
"name": source.name,
"slug": source.slug,
"source_type": source.source_type,
"notes": source.notes,
"spider_arguments": job.spider_arguments,
"enabled": job.enabled,
2026-03-30 18:26:02 +02:00
"convert_images": job.convert_images,
"convert_video": job.convert_video,
2026-03-30 13:49:00 +02:00
"cron_minute": job.cron_minute,
"cron_hour": job.cron_hour,
"cron_day_of_month": job.cron_day_of_month,
"cron_day_of_week": job.cron_day_of_week,
"cron_month": job.cron_month,
"feed_url": "",
"pangea_domain": "",
"pangea_category": "",
"content_format": "MOBILE_3",
"content_type": "articles",
"max_articles": "10",
"oldest_article": "3",
"only_newest": True,
"include_authors": True,
"exclude_media": False,
"include_content": True,
}
if source.source_type == "feed":
feed = SourceFeed.get(SourceFeed.source == source)
form_data["feed_url"] = feed.feed_url
else:
pangea = SourcePangea.get(SourcePangea.source == source)
form_data.update(
{
"pangea_domain": pangea.domain,
"pangea_category": pangea.category_name,
"content_format": pangea.content_format,
"content_type": pangea.content_type,
"max_articles": str(pangea.max_articles),
"oldest_article": str(pangea.oldest_article),
"only_newest": pangea.only_newest,
"include_authors": pangea.include_authors,
"exclude_media": pangea.exclude_media,
"include_content": pangea.include_content,
}
)
return form_data
2026-03-30 13:37:25 +02:00
def create_source(
*,
name: str,
slug: str,
source_type: str,
notes: str,
spider_arguments: str,
enabled: bool,
cron_minute: str,
cron_hour: str,
cron_day_of_month: str,
cron_day_of_week: str,
cron_month: str,
2026-03-30 18:26:02 +02:00
convert_images: bool = True,
convert_video: bool = True,
2026-03-30 13:37:25 +02:00
feed_url: str = "",
pangea_domain: str = "",
pangea_category: str = "",
content_type: str = "",
only_newest: bool = True,
max_articles: int | None = None,
oldest_article: int | None = None,
include_authors: bool = True,
exclude_media: bool = False,
include_content: bool = True,
content_format: str = "",
) -> Source:
with database.connection_context():
with database.atomic():
source = Source.create(
name=name,
slug=slug,
source_type=source_type,
notes=notes,
)
if source_type == "feed":
SourceFeed.create(
source=source,
feed_url=feed_url,
)
else:
SourcePangea.create(
source=source,
domain=pangea_domain,
category_name=pangea_category,
content_type=content_type,
only_newest=only_newest,
max_articles=max_articles,
oldest_article=oldest_article,
include_authors=include_authors,
exclude_media=exclude_media,
include_content=include_content,
content_format=content_format,
)
Job.create(
source=source,
enabled=enabled,
2026-03-30 18:26:02 +02:00
convert_images=convert_images,
convert_video=convert_video,
2026-03-30 13:37:25 +02:00
spider_arguments=spider_arguments,
cron_minute=cron_minute,
cron_hour=cron_hour,
cron_day_of_month=cron_day_of_month,
cron_day_of_week=cron_day_of_week,
cron_month=cron_month,
)
return source
2026-03-30 13:49:00 +02:00
def update_source(
source_slug: str,
*,
name: str,
slug: str,
source_type: str,
notes: str,
spider_arguments: str,
enabled: bool,
cron_minute: str,
cron_hour: str,
cron_day_of_month: str,
cron_day_of_week: str,
cron_month: str,
2026-03-30 18:26:02 +02:00
convert_images: bool = True,
convert_video: bool = True,
2026-03-30 13:49:00 +02:00
feed_url: str = "",
pangea_domain: str = "",
pangea_category: str = "",
content_type: str = "",
only_newest: bool = True,
max_articles: int | None = None,
oldest_article: int | None = None,
include_authors: bool = True,
exclude_media: bool = False,
include_content: bool = True,
content_format: str = "",
) -> Source | None:
with database.connection_context():
with database.atomic():
source = Source.get_or_none(Source.slug == source_slug)
if source is None:
return None
source.name = name
source.notes = notes
source.source_type = source_type
source.save()
job = Job.get(Job.source == source)
job.enabled = enabled
2026-03-30 18:26:02 +02:00
job.convert_images = convert_images
job.convert_video = convert_video
2026-03-30 13:49:00 +02:00
job.spider_arguments = spider_arguments
job.cron_minute = cron_minute
job.cron_hour = cron_hour
job.cron_day_of_month = cron_day_of_month
job.cron_day_of_week = cron_day_of_week
job.cron_month = cron_month
job.save()
if source_type == "feed":
SourcePangea.delete().where(SourcePangea.source == source).execute()
feed = SourceFeed.get_or_none(SourceFeed.source == source)
if feed is None:
SourceFeed.create(source=source, feed_url=feed_url)
else:
feed.feed_url = feed_url
feed.save()
else:
SourceFeed.delete().where(SourceFeed.source == source).execute()
pangea = SourcePangea.get_or_none(SourcePangea.source == source)
if pangea is None:
SourcePangea.create(
source=source,
domain=pangea_domain,
category_name=pangea_category,
content_type=content_type,
only_newest=only_newest,
max_articles=max_articles,
oldest_article=oldest_article,
include_authors=include_authors,
exclude_media=exclude_media,
include_content=include_content,
content_format=content_format,
)
else:
pangea.domain = pangea_domain
pangea.category_name = pangea_category
pangea.content_type = content_type
pangea.only_newest = only_newest
pangea.max_articles = max_articles
pangea.oldest_article = oldest_article
pangea.include_authors = include_authors
pangea.exclude_media = exclude_media
pangea.include_content = include_content
pangea.content_format = content_format
pangea.save()
return source
2026-03-30 14:02:39 +02:00
def delete_job_source(job_id: int) -> bool:
with database.connection_context():
with database.atomic():
job = Job.get_or_none(id=job_id)
if job is None:
return False
source = Source.get_by_id(job.source_id)
return source.delete_instance() > 0
def delete_source(slug: str) -> bool:
with database.connection_context():
with database.atomic():
source = Source.get_or_none(Source.slug == slug)
if source is None:
return False
return source.delete_instance() > 0
2026-03-30 13:37:25 +02:00
def load_sources() -> tuple[dict[str, object], ...]:
with database.connection_context():
sources = tuple(Source.select().order_by(Source.created_at.desc()))
source_ids = tuple(int(source.get_id()) for source in sources)
if not source_ids:
return ()
jobs = {
job.source_id: job for job in Job.select().where(Job.source.in_(source_ids))
}
feed_configs = {
config.source_id: config
for config in SourceFeed.select().where(SourceFeed.source.in_(source_ids))
}
pangea_configs = {
config.source_id: config
for config in SourcePangea.select().where(
SourcePangea.source.in_(source_ids)
)
}
return tuple(
_project_source(source, jobs, feed_configs, pangea_configs)
for source in sources
)
def _project_source(
source: "Source",
jobs: dict[int, "Job"],
feed_configs: dict[int, "SourceFeed"],
pangea_configs: dict[int, "SourcePangea"],
) -> dict[str, object]:
source_id = int(source.get_id())
job = jobs[source_id]
if source.source_type == "feed":
upstream = feed_configs[source_id].feed_url
source_type = "Feed"
else:
pangea = pangea_configs[source_id]
upstream = f"{pangea.domain} / {pangea.category_name}"
source_type = "Pangea"
return {
"name": source.name,
"slug": source.slug,
"source_type": source_type,
"upstream": upstream,
"schedule": (
f"cron: {job.cron_minute} {job.cron_hour} {job.cron_day_of_month} "
f"{job.cron_month} {job.cron_day_of_week}"
),
"last_run": "Never run",
"state": "Enabled" if job.enabled else "Disabled",
"state_tone": "scheduled" if job.enabled else "idle",
}
2026-03-30 13:26:59 +02:00
class BaseModel(Model):
class Meta:
database = database
2026-03-30 18:26:02 +02:00
class AppSetting(BaseModel):
key = TextField(primary_key=True)
value = TextField()
class Meta:
table_name = "app_setting"
2026-03-30 13:26:59 +02:00
class Source(BaseModel):
created_at = DateTimeField(default=utc_now)
updated_at = DateTimeField(default=utc_now)
name = TextField()
slug = TextField(unique=True)
source_type = TextField(constraints=[Check("source_type IN ('feed', 'pangea')")])
notes = TextField(default="")
class Meta:
table_name = "source"
class SourceFeed(BaseModel):
source = ForeignKeyField(Source, primary_key=True, backref="feed_config")
feed_url = TextField()
etag = TextField(null=True)
last_modified = TextField(null=True)
class Meta:
table_name = "source_feed"
class SourcePangea(BaseModel):
source = ForeignKeyField(Source, primary_key=True, backref="pangea_config")
domain = TextField()
category_name = TextField()
content_type = TextField()
only_newest = BooleanField()
max_articles = IntegerField()
oldest_article = IntegerField()
include_authors = BooleanField()
exclude_media = BooleanField()
include_content = BooleanField()
content_format = TextField()
class Meta:
table_name = "source_pangea"
class Job(BaseModel):
source = ForeignKeyField(Source, unique=True, backref="job")
created_at = DateTimeField(default=utc_now)
updated_at = DateTimeField(default=utc_now)
enabled = BooleanField()
2026-03-30 18:26:02 +02:00
convert_images = BooleanField(default=True)
convert_video = BooleanField(default=True)
2026-03-30 13:26:59 +02:00
spider_arguments = TextField(default="")
cron_minute = TextField()
cron_hour = TextField()
cron_day_of_month = TextField()
cron_day_of_week = TextField()
cron_month = TextField()
class Meta:
table_name = "job"
class JobExecution(BaseModel):
job = ForeignKeyField(Job, backref="executions")
created_at = DateTimeField(default=utc_now)
started_at = DateTimeField(null=True)
ended_at = DateTimeField(null=True)
2026-03-30 14:02:39 +02:00
stop_requested_at = DateTimeField(null=True)
2026-03-30 13:26:59 +02:00
running_status = IntegerField(
default=JobExecutionStatus.PENDING,
constraints=[Check("running_status BETWEEN 0 AND 4")],
)
requests_count = IntegerField(default=0)
items_count = IntegerField(default=0)
warnings_count = IntegerField(default=0)
errors_count = IntegerField(default=0)
bytes_count = IntegerField(default=0)
retries_count = IntegerField(default=0)
exceptions_count = IntegerField(default=0)
cache_size_count = IntegerField(default=0)
cache_object_count = IntegerField(default=0)
raw_stats = TextField(default="{}")
class Meta:
table_name = "job_execution"