republisher/repub/model.py

563 lines
18 KiB
Python

from __future__ import annotations
import json
import os
from datetime import UTC, datetime
from enum import IntEnum
from importlib import resources
from importlib.resources.abc import Traversable
from pathlib import Path
from typing import Any
from peewee import (
BooleanField,
Check,
DateTimeField,
ForeignKeyField,
IntegerField,
Model,
SqliteDatabase,
TextField,
)
from playhouse.migrate import SchemaMigrator, migrate
# Fallback database filename (relative to the CWD) used when neither an
# explicit path nor the REPUBLISHER_DB_PATH environment variable is given.
DEFAULT_DB_PATH = Path("republisher.db")
# SQLite PRAGMAs applied to every connection opened on ``database``.
DATABASE_PRAGMAS = {
"busy_timeout": 5000,  # ms to wait on a locked database before failing
"cache_size": 15625,  # page-cache size (~61 MiB at the 4096-byte page size below)
"foreign_keys": 1,  # enforce FK constraints (disabled by default in SQLite)
"journal_mode": "wal",  # write-ahead logging: readers don't block the writer
"page_size": 4096,
"synchronous": "normal",  # common pairing with WAL: fewer fsyncs, still durable
"temp_store": "memory",
}
# NOTE(review): unused in this module — schema_paths() filters on the ".sql"
# suffix directly instead of globbing; confirm whether this can be removed.
SCHEMA_GLOB = "*.sql"
# AppSetting keys and their defaults (see load_setting/save_setting).
MAX_CONCURRENT_JOBS_SETTING_KEY = "max_concurrent_jobs"
DEFAULT_MAX_CONCURRENT_JOBS = 1
FEED_URL_SETTING_KEY = "feed_url"
DEFAULT_FEED_URL = ""
# Deferred-initialization handle: the path is bound later by
# initialize_database() via database.init().
database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)
class JobExecutionStatus(IntEnum):
    """Lifecycle states for a job run, stored as the integer value.

    The numeric values are persisted in ``job_execution.running_status``
    (constrained to 0..4), so they must not be renumbered.
    """

    PENDING = 0
    RUNNING = 1
    SUCCEEDED = 2
    FAILED = 3
    CANCELED = 4
def utc_now() -> datetime:
return datetime.now(UTC)
def resolve_database_path(db_path: str | Path | None = None) -> Path:
raw_value = (
os.environ.get("REPUBLISHER_DB_PATH", DEFAULT_DB_PATH)
if db_path is None
else db_path
)
raw_path = Path(raw_value)
return raw_path.expanduser().resolve()
def schema_paths() -> tuple[Traversable, ...]:
    """Return the packaged ``repub/sql/*.sql`` schema files, sorted by name.

    Name-order sorting gives a deterministic execution order for the
    schema scripts applied by initialize_database().
    """
    sql_dir = resources.files("repub").joinpath("sql")
    sql_files = [entry for entry in sql_dir.iterdir() if entry.name.endswith(".sql")]
    sql_files.sort(key=lambda entry: entry.name)
    return tuple(sql_files)
def initialize_database(db_path: str | Path | None = None) -> Path:
    """Bind the module-level database to a file, apply schema, and migrate.

    Resolves the target path (see resolve_database_path), creates parent
    directories, runs every packaged ``.sql`` script, then applies the
    in-code legacy migrations.  The connection is closed before returning
    so callers start from a clean, unbound-connection state.

    Returns the resolved absolute path of the database file.
    """
    resolved_path = resolve_database_path(db_path)
    resolved_path.parent.mkdir(parents=True, exist_ok=True)
    # Re-initializing while a connection is open is unsafe; close it first.
    if not database.is_closed():
        database.close()
    database.init(str(resolved_path), pragmas=DATABASE_PRAGMAS)
    database.connect(reuse_if_open=True)
    try:
        # NOTE(review): scripts run on every startup, so they are presumably
        # idempotent (CREATE TABLE IF NOT EXISTS etc.) — confirm in repub/sql.
        for path in schema_paths():
            database.connection().executescript(path.read_text(encoding="utf-8"))
        _run_legacy_migrations()
    finally:
        database.close()
    return resolved_path
def _run_legacy_migrations() -> None:
    """Backfill the convert_images/convert_video columns on legacy ``job`` tables.

    Pre-existing databases lack these boolean flags; add each missing one
    with a default of 1 (true) and a 0/1 CHECK constraint, all inside a
    single transaction.
    """
    existing_columns = {column.name for column in database.get_columns("job")}
    migrator = SchemaMigrator.from_database(database)
    pending = []
    for column_name in ("convert_images", "convert_video"):
        if column_name in existing_columns:
            continue
        flag_field = BooleanField(
            default=True,
            constraints=[Check(f"{column_name} IN (0, 1)")],
        )
        pending.append(migrator.add_column("job", column_name, flag_field))
        pending.append(migrator.add_column_default("job", column_name, 1))
    if pending:
        with database.atomic():
            migrate(*pending)
def source_slug_exists(slug: str) -> bool:
    """Return True when a Source row with *slug* already exists."""
    with database.connection_context():
        query = Source.select().where(Source.slug == slug)
        return query.exists()
def save_setting(key: str, value: Any) -> None:
    """Upsert *value* under *key* in app_setting, serialized as JSON."""
    serialized = json.dumps(value, sort_keys=True)
    with database.connection_context(), database.atomic():
        existing = AppSetting.get_or_none(AppSetting.key == key)
        if existing is None:
            AppSetting.create(key=key, value=serialized)
        else:
            existing.value = serialized
            existing.save()
def load_setting(key: str, default: Any) -> Any:
    """Fetch and JSON-decode the setting for *key*.

    Returns *default* when the key is missing or its stored value is not
    valid JSON.
    """
    with database.connection_context():
        row = AppSetting.get_or_none(AppSetting.key == key)
    if row is None:
        return default
    try:
        return json.loads(row.value)
    except json.JSONDecodeError:
        return default
def load_max_concurrent_jobs() -> int:
    """Return the configured job-concurrency limit, never below 1.

    Falls back to DEFAULT_MAX_CONCURRENT_JOBS when the stored value is
    missing, non-numeric, or less than one.
    """
    stored = load_setting(MAX_CONCURRENT_JOBS_SETTING_KEY, DEFAULT_MAX_CONCURRENT_JOBS)
    try:
        limit = int(stored)
    except (TypeError, ValueError):
        return DEFAULT_MAX_CONCURRENT_JOBS
    if limit < 1:
        return DEFAULT_MAX_CONCURRENT_JOBS
    return limit
def load_feed_url() -> str:
    """Return the configured feed URL, or DEFAULT_FEED_URL if unset/non-string."""
    stored = load_setting(FEED_URL_SETTING_KEY, DEFAULT_FEED_URL)
    if isinstance(stored, str):
        return stored
    return DEFAULT_FEED_URL
def load_settings_form() -> dict[str, object]:
    """Assemble the settings-page form values from the persisted settings."""
    form: dict[str, object] = {}
    form["max_concurrent_jobs"] = load_max_concurrent_jobs()
    form["feed_url"] = load_feed_url()
    return form
def load_source_form(slug: str) -> dict[str, object] | None:
    """Build the edit-form payload for the source identified by *slug*.

    Returns None when no such source exists.  The dict always contains the
    full superset of feed- and Pangea-specific keys; the keys belonging to
    the inactive source type keep the placeholder defaults below, and the
    active type's keys are overwritten from its config row.
    """
    with database.connection_context():
        source = Source.get_or_none(Source.slug == slug)
        if source is None:
            return None
        # Every source has exactly one Job (created alongside it).
        job = Job.get(Job.source == source)
        form_data: dict[str, object] = {
            "name": source.name,
            "slug": source.slug,
            "source_type": source.source_type,
            "notes": source.notes,
            "spider_arguments": job.spider_arguments,
            "enabled": job.enabled,
            "convert_images": job.convert_images,
            "convert_video": job.convert_video,
            "cron_minute": job.cron_minute,
            "cron_hour": job.cron_hour,
            "cron_day_of_month": job.cron_day_of_month,
            "cron_day_of_week": job.cron_day_of_week,
            "cron_month": job.cron_month,
            # Placeholder defaults for the type-specific fields; the active
            # type's values are filled in below.
            "feed_url": "",
            "pangea_domain": "",
            "pangea_category": "",
            "content_format": "MOBILE_3",
            "content_type": "articles",
            "max_articles": "10",
            "oldest_article": "3",
            "only_newest": True,
            "include_authors": True,
            "exclude_media": False,
            "include_content": True,
        }
        if source.source_type == "feed":
            feed = SourceFeed.get(SourceFeed.source == source)
            form_data["feed_url"] = feed.feed_url
        else:
            pangea = SourcePangea.get(SourcePangea.source == source)
            form_data.update(
                {
                    "pangea_domain": pangea.domain,
                    "pangea_category": pangea.category_name,
                    "content_format": pangea.content_format,
                    "content_type": pangea.content_type,
                    # Integer columns are rendered back as strings for the form.
                    "max_articles": str(pangea.max_articles),
                    "oldest_article": str(pangea.oldest_article),
                    "only_newest": pangea.only_newest,
                    "include_authors": pangea.include_authors,
                    "exclude_media": pangea.exclude_media,
                    "include_content": pangea.include_content,
                }
            )
        return form_data
def create_source(
    *,
    name: str,
    slug: str,
    source_type: str,
    notes: str,
    spider_arguments: str,
    enabled: bool,
    cron_minute: str,
    cron_hour: str,
    cron_day_of_month: str,
    cron_day_of_week: str,
    cron_month: str,
    convert_images: bool = True,
    convert_video: bool = True,
    feed_url: str = "",
    pangea_domain: str = "",
    pangea_category: str = "",
    content_type: str = "",
    only_newest: bool = True,
    max_articles: int | None = None,
    oldest_article: int | None = None,
    include_authors: bool = True,
    exclude_media: bool = False,
    include_content: bool = True,
    content_format: str = "",
) -> Source:
    """Create a Source plus its type-specific config row and its Job.

    All rows are written in a single transaction.  *source_type* selects
    the config row: ``"feed"`` creates a SourceFeed from *feed_url*; any
    other value creates a SourcePangea from the ``pangea_*``/content
    parameters.  Returns the newly created Source.
    """
    with database.connection_context():
        with database.atomic():
            source = Source.create(
                name=name,
                slug=slug,
                source_type=source_type,
                notes=notes,
            )
            if source_type == "feed":
                SourceFeed.create(
                    source=source,
                    feed_url=feed_url,
                )
            else:
                SourcePangea.create(
                    source=source,
                    domain=pangea_domain,
                    category_name=pangea_category,
                    content_type=content_type,
                    only_newest=only_newest,
                    max_articles=max_articles,
                    oldest_article=oldest_article,
                    include_authors=include_authors,
                    exclude_media=exclude_media,
                    include_content=include_content,
                    content_format=content_format,
                )
            # Every source gets exactly one scheduling Job (1:1, see Job.source).
            Job.create(
                source=source,
                enabled=enabled,
                convert_images=convert_images,
                convert_video=convert_video,
                spider_arguments=spider_arguments,
                cron_minute=cron_minute,
                cron_hour=cron_hour,
                cron_day_of_month=cron_day_of_month,
                cron_day_of_week=cron_day_of_week,
                cron_month=cron_month,
            )
    return source
def update_source(
    source_slug: str,
    *,
    name: str,
    slug: str,
    source_type: str,
    notes: str,
    spider_arguments: str,
    enabled: bool,
    cron_minute: str,
    cron_hour: str,
    cron_day_of_month: str,
    cron_day_of_week: str,
    cron_month: str,
    convert_images: bool = True,
    convert_video: bool = True,
    feed_url: str = "",
    pangea_domain: str = "",
    pangea_category: str = "",
    content_type: str = "",
    only_newest: bool = True,
    max_articles: int | None = None,
    oldest_article: int | None = None,
    include_authors: bool = True,
    exclude_media: bool = False,
    include_content: bool = True,
    content_format: str = "",
) -> Source | None:
    """Update the source currently identified by *source_slug*.

    Rewrites the Source row, its Job, and the type-specific config row in
    a single transaction.  Switching *source_type* deletes the other
    type's config row.  Returns the updated Source, or None when
    *source_slug* matches nothing.
    """
    with database.connection_context():
        with database.atomic():
            source = Source.get_or_none(Source.slug == source_slug)
            if source is None:
                return None
            now = utc_now()
            source.name = name
            # Fix: the submitted slug was previously discarded, so renames
            # coming from the edit form were silently ignored.
            source.slug = slug
            source.notes = notes
            source.source_type = source_type
            # Fix: refresh the audit timestamp; the column default only
            # fires on insert, so edits never updated it.
            source.updated_at = now
            source.save()
            job = Job.get(Job.source == source)
            job.enabled = enabled
            job.convert_images = convert_images
            job.convert_video = convert_video
            job.spider_arguments = spider_arguments
            job.cron_minute = cron_minute
            job.cron_hour = cron_hour
            job.cron_day_of_month = cron_day_of_month
            job.cron_day_of_week = cron_day_of_week
            job.cron_month = cron_month
            job.updated_at = now
            job.save()
            if source_type == "feed":
                # Drop any stale Pangea config left over from a type switch.
                SourcePangea.delete().where(SourcePangea.source == source).execute()
                feed = SourceFeed.get_or_none(SourceFeed.source == source)
                if feed is None:
                    SourceFeed.create(source=source, feed_url=feed_url)
                else:
                    feed.feed_url = feed_url
                    feed.save()
            else:
                # Drop any stale feed config left over from a type switch.
                SourceFeed.delete().where(SourceFeed.source == source).execute()
                pangea = SourcePangea.get_or_none(SourcePangea.source == source)
                if pangea is None:
                    SourcePangea.create(
                        source=source,
                        domain=pangea_domain,
                        category_name=pangea_category,
                        content_type=content_type,
                        only_newest=only_newest,
                        max_articles=max_articles,
                        oldest_article=oldest_article,
                        include_authors=include_authors,
                        exclude_media=exclude_media,
                        include_content=include_content,
                        content_format=content_format,
                    )
                else:
                    pangea.domain = pangea_domain
                    pangea.category_name = pangea_category
                    pangea.content_type = content_type
                    pangea.only_newest = only_newest
                    pangea.max_articles = max_articles
                    pangea.oldest_article = oldest_article
                    pangea.include_authors = include_authors
                    pangea.exclude_media = exclude_media
                    pangea.include_content = include_content
                    pangea.content_format = content_format
                    pangea.save()
    return source
def delete_job_source(job_id: int) -> bool:
    """Delete the Source that owns *job_id*; False when no such job exists."""
    with database.connection_context(), database.atomic():
        job = Job.get_or_none(id=job_id)
        if job is None:
            return False
        owner = Source.get_by_id(job.source_id)
        return owner.delete_instance() > 0
def delete_source(slug: str) -> bool:
    """Delete the source with *slug*; False when it does not exist."""
    with database.connection_context(), database.atomic():
        target = Source.get_or_none(Source.slug == slug)
        if target is None:
            return False
        return target.delete_instance() > 0
def load_sources() -> tuple[dict[str, object], ...]:
    """Return display projections for every source, newest first.

    Fetches the jobs and type-specific configs for all sources in three
    bulk queries, then flattens each source via _project_source.
    """
    with database.connection_context():
        all_sources = tuple(Source.select().order_by(Source.created_at.desc()))
        ids = tuple(int(src.get_id()) for src in all_sources)
        if not ids:
            return ()
        job_by_source = {
            row.source_id: row for row in Job.select().where(Job.source.in_(ids))
        }
        feed_by_source = {
            row.source_id: row
            for row in SourceFeed.select().where(SourceFeed.source.in_(ids))
        }
        pangea_by_source = {
            row.source_id: row
            for row in SourcePangea.select().where(SourcePangea.source.in_(ids))
        }
        return tuple(
            _project_source(src, job_by_source, feed_by_source, pangea_by_source)
            for src in all_sources
        )
def _project_source(
    source: "Source",
    jobs: dict[int, "Job"],
    feed_configs: dict[int, "SourceFeed"],
    pangea_configs: dict[int, "SourcePangea"],
) -> dict[str, object]:
    """Flatten one source plus its job and type config into a display row."""
    key = int(source.get_id())
    job = jobs[key]
    if source.source_type == "feed":
        kind_label = "Feed"
        upstream_label = feed_configs[key].feed_url
    else:
        cfg = pangea_configs[key]
        kind_label = "Pangea"
        upstream_label = f"{cfg.domain} / {cfg.category_name}"
    # Classic five-field cron order: minute hour day-of-month month day-of-week.
    schedule_label = (
        f"cron: {job.cron_minute} {job.cron_hour} {job.cron_day_of_month} "
        f"{job.cron_month} {job.cron_day_of_week}"
    )
    if job.enabled:
        state_label, tone = "Enabled", "scheduled"
    else:
        state_label, tone = "Disabled", "idle"
    return {
        "name": source.name,
        "slug": source.slug,
        "source_type": kind_label,
        "upstream": upstream_label,
        "schedule": schedule_label,
        "last_run": "Never run",
        "state": state_label,
        "state_tone": tone,
    }
class BaseModel(Model):
    """Common peewee base binding every model to the module-level database."""

    class Meta:
        database = database
class AppSetting(BaseModel):
    """Key/value application settings; values are JSON-encoded strings."""

    key = TextField(primary_key=True)
    value = TextField()  # JSON payload written by save_setting, read by load_setting
    class Meta:
        table_name = "app_setting"
class Source(BaseModel):
    """A content source; configured via SourceFeed or SourcePangea per source_type."""

    created_at = DateTimeField(default=utc_now)  # set on insert (UTC)
    updated_at = DateTimeField(default=utc_now)  # set on insert (UTC)
    name = TextField()
    slug = TextField(unique=True)  # unique identifier used for lookups
    # Discriminator choosing the config table: 'feed' -> SourceFeed,
    # 'pangea' -> SourcePangea.
    source_type = TextField(constraints=[Check("source_type IN ('feed', 'pangea')")])
    notes = TextField(default="")
    class Meta:
        table_name = "source"
class SourceFeed(BaseModel):
    """Feed-type configuration for a Source (1:1 via the source PK)."""

    source = ForeignKeyField(Source, primary_key=True, backref="feed_config")
    feed_url = TextField()
    # HTTP conditional-request state; presumably ETag / Last-Modified headers
    # from the last fetch — confirm against the fetch code.
    etag = TextField(null=True)
    last_modified = TextField(null=True)
    class Meta:
        table_name = "source_feed"
class SourcePangea(BaseModel):
    """Pangea-type configuration for a Source (1:1 via the source PK)."""

    source = ForeignKeyField(Source, primary_key=True, backref="pangea_config")
    domain = TextField()
    category_name = TextField()
    content_type = TextField()
    only_newest = BooleanField()
    max_articles = IntegerField()
    oldest_article = IntegerField()
    include_authors = BooleanField()
    exclude_media = BooleanField()
    include_content = BooleanField()
    content_format = TextField()
    class Meta:
        table_name = "source_pangea"
class Job(BaseModel):
    """Per-source job: enable flag, conversion toggles, spider args, cron schedule."""

    source = ForeignKeyField(Source, unique=True, backref="job")  # 1:1 with Source
    created_at = DateTimeField(default=utc_now)  # set on insert (UTC)
    updated_at = DateTimeField(default=utc_now)  # set on insert (UTC)
    enabled = BooleanField()
    convert_images = BooleanField(default=True)
    convert_video = BooleanField(default=True)
    spider_arguments = TextField(default="")
    # The five classic cron fields, stored as free-form text.
    cron_minute = TextField()
    cron_hour = TextField()
    cron_day_of_month = TextField()
    cron_day_of_week = TextField()
    cron_month = TextField()
    class Meta:
        table_name = "job"
class JobExecution(BaseModel):
    """One run of a Job: lifecycle timestamps, status, and run counters."""

    job = ForeignKeyField(Job, backref="executions")
    created_at = DateTimeField(default=utc_now)  # row creation time (UTC)
    started_at = DateTimeField(null=True)  # nullable: unset until populated by the runner
    ended_at = DateTimeField(null=True)  # nullable: unset until populated by the runner
    stop_requested_at = DateTimeField(null=True)  # nullable cancel-request marker
    # Integer mirror of JobExecutionStatus; CHECK matches the enum range 0..4.
    running_status = IntegerField(
        default=JobExecutionStatus.PENDING,
        constraints=[Check("running_status BETWEEN 0 AND 4")],
    )
    requests_count = IntegerField(default=0)
    items_count = IntegerField(default=0)
    warnings_count = IntegerField(default=0)
    errors_count = IntegerField(default=0)
    bytes_count = IntegerField(default=0)
    retries_count = IntegerField(default=0)
    exceptions_count = IntegerField(default=0)
    cache_size_count = IntegerField(default=0)
    cache_object_count = IntegerField(default=0)
    raw_stats = TextField(default="{}")  # raw stats payload; default is an empty JSON object
    class Meta:
        table_name = "job_execution"