republisher/repub/model.py

from __future__ import annotations

import json
import os
from datetime import UTC, datetime
from enum import IntEnum
from importlib import resources
from importlib.resources.abc import Traversable
from pathlib import Path
from typing import Any

from peewee import (
    BooleanField,
    Check,
    DateTimeField,
    ForeignKeyField,
    IntegerField,
    Model,
    SqliteDatabase,
    TextField,
)
from playhouse.migrate import SchemaMigrator, migrate

DEFAULT_DB_PATH = Path("republisher.db")
DATABASE_PRAGMAS = {
    "busy_timeout": 5000,
    "cache_size": 15625,
    "foreign_keys": 1,
    "journal_mode": "wal",
    "page_size": 4096,
    "synchronous": "normal",
    "temp_store": "memory",
}
SCHEMA_GLOB = "*.sql"
MAX_CONCURRENT_JOBS_SETTING_KEY = "max_concurrent_jobs"
DEFAULT_MAX_CONCURRENT_JOBS = 1
FEED_URL_SETTING_KEY = "feed_url"
DEFAULT_FEED_URL = ""

database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)


class JobExecutionStatus(IntEnum):
    PENDING = 0
    RUNNING = 1
    SUCCEEDED = 2
    FAILED = 3
    CANCELED = 4


def utc_now() -> datetime:
    return datetime.now(UTC)


def resolve_database_path(db_path: str | Path | None = None) -> Path:
    raw_value = (
        os.environ.get("REPUBLISHER_DB_PATH", DEFAULT_DB_PATH)
        if db_path is None
        else db_path
    )
    raw_path = Path(raw_value)
    return raw_path.expanduser().resolve()


def schema_paths() -> tuple[Traversable, ...]:
    schema_dir = resources.files("repub").joinpath("sql")
    return tuple(
        sorted(
            (path for path in schema_dir.iterdir() if path.name.endswith(".sql")),
            key=lambda path: path.name,
        )
    )


def initialize_database(db_path: str | Path | None = None) -> Path:
    resolved_path = resolve_database_path(db_path)
    resolved_path.parent.mkdir(parents=True, exist_ok=True)

    if not database.is_closed():
        database.close()

    database.init(str(resolved_path), pragmas=DATABASE_PRAGMAS)
    database.connect(reuse_if_open=True)
    try:
        for path in schema_paths():
            database.connection().executescript(path.read_text(encoding="utf-8"))
        _run_legacy_migrations()
    finally:
        database.close()

    return resolved_path


def _run_legacy_migrations() -> None:
    job_columns = {column.name for column in database.get_columns("job")}
    operations = []
    migrator = SchemaMigrator.from_database(database)
    if "convert_images" not in job_columns:
        operations.extend(
            (
                migrator.add_column(
                    "job",
                    "convert_images",
                    BooleanField(
                        default=True,
                        constraints=[Check("convert_images IN (0, 1)")],
                    ),
                ),
                migrator.add_column_default("job", "convert_images", 1),
            )
        )
    if "convert_video" not in job_columns:
        operations.extend(
            (
                migrator.add_column(
                    "job",
                    "convert_video",
                    BooleanField(
                        default=True,
                        constraints=[Check("convert_video IN (0, 1)")],
                    ),
                ),
                migrator.add_column_default("job", "convert_video", 1),
            )
        )
    if operations:
        with database.atomic():
            migrate(*operations)


def source_slug_exists(slug: str) -> bool:
    with database.connection_context():
        return Source.select().where(Source.slug == slug).exists()


def save_setting(key: str, value: Any) -> None:
    payload = json.dumps(value, sort_keys=True)
    with database.connection_context():
        with database.atomic():
            setting = AppSetting.get_or_none(AppSetting.key == key)
            if setting is None:
                AppSetting.create(key=key, value=payload)
                return
            setting.value = payload
            setting.save()


def load_setting(key: str, default: Any) -> Any:
    with database.connection_context():
        setting = AppSetting.get_or_none(AppSetting.key == key)
    if setting is None:
        return default
    try:
        return json.loads(setting.value)
    except json.JSONDecodeError:
        return default


def load_max_concurrent_jobs() -> int:
    value = load_setting(MAX_CONCURRENT_JOBS_SETTING_KEY, DEFAULT_MAX_CONCURRENT_JOBS)
    try:
        parsed = int(value)
    except (TypeError, ValueError):
        return DEFAULT_MAX_CONCURRENT_JOBS
    return parsed if parsed >= 1 else DEFAULT_MAX_CONCURRENT_JOBS


def load_feed_url() -> str:
    value = load_setting(FEED_URL_SETTING_KEY, DEFAULT_FEED_URL)
    return value if isinstance(value, str) else DEFAULT_FEED_URL


def load_settings_form() -> dict[str, object]:
    return {
        "max_concurrent_jobs": load_max_concurrent_jobs(),
        "feed_url": load_feed_url(),
    }


def load_source_form(slug: str) -> dict[str, object] | None:
    with database.connection_context():
        source = Source.get_or_none(Source.slug == slug)
        if source is None:
            return None

        job = Job.get(Job.source == source)
        form_data: dict[str, object] = {
            "name": source.name,
            "slug": source.slug,
            "source_type": source.source_type,
            "notes": source.notes,
            "spider_arguments": job.spider_arguments,
            "enabled": job.enabled,
            "convert_images": job.convert_images,
            "convert_video": job.convert_video,
            "cron_minute": job.cron_minute,
            "cron_hour": job.cron_hour,
            "cron_day_of_month": job.cron_day_of_month,
            "cron_day_of_week": job.cron_day_of_week,
            "cron_month": job.cron_month,
            "feed_url": "",
            "pangea_domain": "",
            "pangea_category": "",
            "content_format": "MOBILE_3",
            "content_type": "articles",
            "max_articles": "10",
            "oldest_article": "3",
            "only_newest": True,
            "include_authors": True,
            "exclude_media": False,
            "include_content": True,
        }
        if source.source_type == "feed":
            feed = SourceFeed.get(SourceFeed.source == source)
            form_data["feed_url"] = feed.feed_url
        else:
            pangea = SourcePangea.get(SourcePangea.source == source)
            form_data.update(
                {
                    "pangea_domain": pangea.domain,
                    "pangea_category": pangea.category_name,
                    "content_format": pangea.content_format,
                    "content_type": pangea.content_type,
                    "max_articles": str(pangea.max_articles),
                    "oldest_article": str(pangea.oldest_article),
                    "only_newest": pangea.only_newest,
                    "include_authors": pangea.include_authors,
                    "exclude_media": pangea.exclude_media,
                    "include_content": pangea.include_content,
                }
            )
        return form_data


def create_source(
    *,
    name: str,
    slug: str,
    source_type: str,
    notes: str,
    spider_arguments: str,
    enabled: bool,
    cron_minute: str,
    cron_hour: str,
    cron_day_of_month: str,
    cron_day_of_week: str,
    cron_month: str,
    convert_images: bool = True,
    convert_video: bool = True,
    feed_url: str = "",
    pangea_domain: str = "",
    pangea_category: str = "",
    content_type: str = "",
    only_newest: bool = True,
    max_articles: int | None = None,
    oldest_article: int | None = None,
    include_authors: bool = True,
    exclude_media: bool = False,
    include_content: bool = True,
    content_format: str = "",
) -> Source:
    with database.connection_context():
        with database.atomic():
            source = Source.create(
                name=name,
                slug=slug,
                source_type=source_type,
                notes=notes,
            )
            if source_type == "feed":
                SourceFeed.create(
                    source=source,
                    feed_url=feed_url,
                )
            else:
                SourcePangea.create(
                    source=source,
                    domain=pangea_domain,
                    category_name=pangea_category,
                    content_type=content_type,
                    only_newest=only_newest,
                    max_articles=max_articles,
                    oldest_article=oldest_article,
                    include_authors=include_authors,
                    exclude_media=exclude_media,
                    include_content=include_content,
                    content_format=content_format,
                )
            Job.create(
                source=source,
                enabled=enabled,
                convert_images=convert_images,
                convert_video=convert_video,
                spider_arguments=spider_arguments,
                cron_minute=cron_minute,
                cron_hour=cron_hour,
                cron_day_of_month=cron_day_of_month,
                cron_day_of_week=cron_day_of_week,
                cron_month=cron_month,
            )
            return source


def update_source(
    source_slug: str,
    *,
    name: str,
    slug: str,
    source_type: str,
    notes: str,
    spider_arguments: str,
    enabled: bool,
    cron_minute: str,
    cron_hour: str,
    cron_day_of_month: str,
    cron_day_of_week: str,
    cron_month: str,
    convert_images: bool = True,
    convert_video: bool = True,
    feed_url: str = "",
    pangea_domain: str = "",
    pangea_category: str = "",
    content_type: str = "",
    only_newest: bool = True,
    max_articles: int | None = None,
    oldest_article: int | None = None,
    include_authors: bool = True,
    exclude_media: bool = False,
    include_content: bool = True,
    content_format: str = "",
) -> Source | None:
    with database.connection_context():
        with database.atomic():
            source = Source.get_or_none(Source.slug == source_slug)
            if source is None:
                return None

            source.name = name
            source.notes = notes
            source.source_type = source_type
            source.save()

            job = Job.get(Job.source == source)
            job.enabled = enabled
            job.convert_images = convert_images
            job.convert_video = convert_video
            job.spider_arguments = spider_arguments
            job.cron_minute = cron_minute
            job.cron_hour = cron_hour
            job.cron_day_of_month = cron_day_of_month
            job.cron_day_of_week = cron_day_of_week
            job.cron_month = cron_month
            job.save()

            if source_type == "feed":
                SourcePangea.delete().where(SourcePangea.source == source).execute()
                feed = SourceFeed.get_or_none(SourceFeed.source == source)
                if feed is None:
                    SourceFeed.create(source=source, feed_url=feed_url)
                else:
                    feed.feed_url = feed_url
                    feed.save()
            else:
                SourceFeed.delete().where(SourceFeed.source == source).execute()
                pangea = SourcePangea.get_or_none(SourcePangea.source == source)
                if pangea is None:
                    SourcePangea.create(
                        source=source,
                        domain=pangea_domain,
                        category_name=pangea_category,
                        content_type=content_type,
                        only_newest=only_newest,
                        max_articles=max_articles,
                        oldest_article=oldest_article,
                        include_authors=include_authors,
                        exclude_media=exclude_media,
                        include_content=include_content,
                        content_format=content_format,
                    )
                else:
                    pangea.domain = pangea_domain
                    pangea.category_name = pangea_category
                    pangea.content_type = content_type
                    pangea.only_newest = only_newest
                    pangea.max_articles = max_articles
                    pangea.oldest_article = oldest_article
                    pangea.include_authors = include_authors
                    pangea.exclude_media = exclude_media
                    pangea.include_content = include_content
                    pangea.content_format = content_format
                    pangea.save()

            return source


def delete_job_source(job_id: int) -> bool:
    with database.connection_context():
        with database.atomic():
            job = Job.get_or_none(id=job_id)
            if job is None:
                return False
            source = Source.get_by_id(job.source_id)
            return source.delete_instance() > 0


def delete_source(slug: str) -> bool:
    with database.connection_context():
        with database.atomic():
            source = Source.get_or_none(Source.slug == slug)
            if source is None:
                return False
            return source.delete_instance() > 0


def load_sources() -> tuple[dict[str, object], ...]:
    with database.connection_context():
        sources = tuple(Source.select().order_by(Source.created_at.desc()))
        source_ids = tuple(int(source.get_id()) for source in sources)
        if not source_ids:
            return ()
        jobs = {
            job.source_id: job for job in Job.select().where(Job.source.in_(source_ids))
        }
        feed_configs = {
            config.source_id: config
            for config in SourceFeed.select().where(SourceFeed.source.in_(source_ids))
        }
        pangea_configs = {
            config.source_id: config
            for config in SourcePangea.select().where(
                SourcePangea.source.in_(source_ids)
            )
        }
        return tuple(
            _project_source(source, jobs, feed_configs, pangea_configs)
            for source in sources
        )


def _project_source(
    source: "Source",
    jobs: dict[int, "Job"],
    feed_configs: dict[int, "SourceFeed"],
    pangea_configs: dict[int, "SourcePangea"],
) -> dict[str, object]:
    source_id = int(source.get_id())
    job = jobs[source_id]
    if source.source_type == "feed":
        upstream = feed_configs[source_id].feed_url
        source_type = "Feed"
    else:
        pangea = pangea_configs[source_id]
        upstream = f"{pangea.domain} / {pangea.category_name}"
        source_type = "Pangea"

    return {
        "name": source.name,
        "slug": source.slug,
        "source_type": source_type,
        "upstream": upstream,
        "schedule": (
            f"cron: {job.cron_minute} {job.cron_hour} {job.cron_day_of_month} "
            f"{job.cron_month} {job.cron_day_of_week}"
        ),
        "last_run": "Never run",
        "state": "Enabled" if job.enabled else "Disabled",
        "state_tone": "scheduled" if job.enabled else "idle",
    }


class BaseModel(Model):
    class Meta:
        database = database


class AppSetting(BaseModel):
    key = TextField(primary_key=True)
    value = TextField()

    class Meta:
        table_name = "app_setting"


class Source(BaseModel):
    created_at = DateTimeField(default=utc_now)
    updated_at = DateTimeField(default=utc_now)
    name = TextField()
    slug = TextField(unique=True)
    source_type = TextField(constraints=[Check("source_type IN ('feed', 'pangea')")])
    notes = TextField(default="")

    class Meta:
        table_name = "source"


class SourceFeed(BaseModel):
    source = ForeignKeyField(Source, primary_key=True, backref="feed_config")
    feed_url = TextField()
    etag = TextField(null=True)
    last_modified = TextField(null=True)

    class Meta:
        table_name = "source_feed"


class SourcePangea(BaseModel):
    source = ForeignKeyField(Source, primary_key=True, backref="pangea_config")
    domain = TextField()
    category_name = TextField()
    content_type = TextField()
    only_newest = BooleanField()
    max_articles = IntegerField()
    oldest_article = IntegerField()
    include_authors = BooleanField()
    exclude_media = BooleanField()
    include_content = BooleanField()
    content_format = TextField()

    class Meta:
        table_name = "source_pangea"


class Job(BaseModel):
    source = ForeignKeyField(Source, unique=True, backref="job")
    created_at = DateTimeField(default=utc_now)
    updated_at = DateTimeField(default=utc_now)
    enabled = BooleanField()
    convert_images = BooleanField(default=True)
    convert_video = BooleanField(default=True)
    spider_arguments = TextField(default="")
    cron_minute = TextField()
    cron_hour = TextField()
    cron_day_of_month = TextField()
    cron_day_of_week = TextField()
    cron_month = TextField()

    class Meta:
        table_name = "job"


class JobExecution(BaseModel):
    job = ForeignKeyField(Job, backref="executions")
    created_at = DateTimeField(default=utc_now)
    started_at = DateTimeField(null=True)
    ended_at = DateTimeField(null=True)
    stop_requested_at = DateTimeField(null=True)
    running_status = IntegerField(
        default=JobExecutionStatus.PENDING,
        constraints=[Check("running_status BETWEEN 0 AND 4")],
    )
    requests_count = IntegerField(default=0)
    items_count = IntegerField(default=0)
    warnings_count = IntegerField(default=0)
    errors_count = IntegerField(default=0)
    bytes_count = IntegerField(default=0)
    retries_count = IntegerField(default=0)
    exceptions_count = IntegerField(default=0)
    cache_size_count = IntegerField(default=0)
    cache_object_count = IntegerField(default=0)
    raw_stats = TextField(default="{}")

    class Meta:
        table_name = "job_execution"