"""Peewee models and persistence helpers for settings, sources and jobs.

The module re-exports the database handle and path helpers from
``repub.db`` and defines the ORM models (``AppSetting``, ``Source``,
``SourceFeed``, ``SourcePangea``, ``Job``, ``JobExecution``) together with
the functions that read and write them.
"""

from __future__ import annotations

import json
from datetime import UTC, datetime
from enum import IntEnum
from typing import Any

from peewee import (
    BooleanField,
    Check,
    DateTimeField,
    ForeignKeyField,
    IntegerField,
    Model,
    TextField,
)

from repub import db as db_module

# Re-exported from ``repub.db`` so callers can reach them through this module.
DEFAULT_DB_PATH = db_module.DEFAULT_DB_PATH
DATABASE_PRAGMAS = db_module.DATABASE_PRAGMAS
SCHEMA_GLOB = db_module.SCHEMA_GLOB
database = db_module.database
initialize_database = db_module.initialize_database
resolve_database_path = db_module.resolve_database_path
schema_paths = db_module.schema_paths

# Keys and fallback values for rows stored in the ``app_setting`` table.
MAX_CONCURRENT_JOBS_SETTING_KEY = "max_concurrent_jobs"
DEFAULT_MAX_CONCURRENT_JOBS = 1
FEED_URL_SETTING_KEY = "feed_url"
DEFAULT_FEED_URL = ""


class JobExecutionStatus(IntEnum):
    """Integer status codes for ``JobExecution.running_status``.

    The values span 0..4, matching the ``BETWEEN 0 AND 4`` check constraint
    declared on that column.
    """

    PENDING = 0
    RUNNING = 1
    SUCCEEDED = 2
    FAILED = 3
    CANCELED = 4


def utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC ``datetime``."""
    return datetime.now(UTC)


def source_slug_exists(slug: str) -> bool:
    """Return whether a ``Source`` row with the given slug exists."""
    with database.reader():
        return Source.select().where(Source.slug == slug).exists()


def save_setting(key: str, value: Any) -> None:
    """Store ``value`` under ``key`` in ``app_setting`` as JSON text.

    The value is serialized with ``json.dumps(..., sort_keys=True)``. An
    existing row for ``key`` is updated in place; otherwise a new row is
    created.

    Raises:
        TypeError: if ``value`` is not JSON-serializable (raised by
            ``json.dumps`` before the database is touched).
    """
    payload = json.dumps(value, sort_keys=True)
    with database.writer():
        setting = AppSetting.get_or_none(AppSetting.key == key)
        if setting is None:
            AppSetting.create(key=key, value=payload)
            return
        setting.value = payload
        setting.save()


def load_setting(key: str, default: Any) -> Any:
    """Return the JSON-decoded setting stored under ``key``.

    ``default`` is returned when no row exists for ``key`` or when the
    stored text is not valid JSON.
    """
    with database.reader():
        setting = AppSetting.get_or_none(AppSetting.key == key)
        if setting is None:
            return default
        try:
            return json.loads(setting.value)
        except json.JSONDecodeError:
            return default


def load_max_concurrent_jobs() -> int:
    """Return the configured maximum number of concurrent jobs.

    Falls back to ``DEFAULT_MAX_CONCURRENT_JOBS`` when the stored value
    cannot be converted to an integer or is smaller than 1.
    """
    value = load_setting(MAX_CONCURRENT_JOBS_SETTING_KEY, DEFAULT_MAX_CONCURRENT_JOBS)
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: ``json.dumps``/``json.loads`` round-trip ``Infinity``
        # as ``float('inf')``, and ``int(float('inf'))`` raises OverflowError
        # rather than ValueError.
        return DEFAULT_MAX_CONCURRENT_JOBS
    return parsed if parsed >= 1 else DEFAULT_MAX_CONCURRENT_JOBS


def load_feed_url() -> str:
    """Return the configured feed URL, or ``DEFAULT_FEED_URL`` if not a string."""
    value = load_setting(FEED_URL_SETTING_KEY, DEFAULT_FEED_URL)
    return value if isinstance(value, str) else DEFAULT_FEED_URL


def load_settings_form() -> dict[str, object]:
    """Return the current settings as a dict keyed by setting name."""
    return {
        "max_concurrent_jobs": load_max_concurrent_jobs(),
        "feed_url": load_feed_url(),
    }


def load_job_enabled(job_id: int) -> bool | None:
    """Return the ``enabled`` flag of job ``job_id``, or ``None`` if absent."""
    with database.reader():
        job = Job.get_or_none(id=job_id)
        return None if job is None else job.enabled


def load_source_form(slug: str) -> dict[str, object] | None:
    """Return form data for the source identified by ``slug``.

    The result combines the ``Source`` row, its ``Job`` row and the
    type-specific configuration row. Fields belonging to the other source
    type keep the placeholder defaults set below. Any ``source_type`` other
    than ``"feed"`` is read as a Pangea source.

    Returns:
        The form dict, or ``None`` when no source has that slug.

    Raises:
        peewee.DoesNotExist: if the source has no ``Job`` row or no matching
            ``SourceFeed``/``SourcePangea`` row (raised by ``Model.get``).
    """
    with database.reader():
        source = Source.get_or_none(Source.slug == slug)
        if source is None:
            return None
        job = Job.get(Job.source == source)
        form_data: dict[str, object] = {
            "name": source.name,
            "slug": source.slug,
            "source_type": source.source_type,
            "notes": source.notes,
            "spider_arguments": job.spider_arguments,
            "enabled": job.enabled,
            "convert_images": job.convert_images,
            "convert_video": job.convert_video,
            "cron_minute": job.cron_minute,
            "cron_hour": job.cron_hour,
            "cron_day_of_month": job.cron_day_of_month,
            "cron_day_of_week": job.cron_day_of_week,
            "cron_month": job.cron_month,
            # Placeholders for both source types; the branch below overwrites
            # the ones that apply to this source.
            "feed_url": "",
            "pangea_domain": "",
            "pangea_category": "",
            "content_format": "MOBILE_3",
            "content_type": "articles",
            "max_articles": "10",
            "oldest_article": "3",
            "only_newest": True,
            "include_authors": True,
            "exclude_media": False,
            "include_content": True,
        }
        if source.source_type == "feed":
            feed = SourceFeed.get(SourceFeed.source == source)
            form_data["feed_url"] = feed.feed_url
        else:
            pangea = SourcePangea.get(SourcePangea.source == source)
            form_data.update(
                {
                    "pangea_domain": pangea.domain,
                    "pangea_category": pangea.category_name,
                    "content_format": pangea.content_format,
                    "content_type": pangea.content_type,
                    # Numeric limits are exposed as strings in the form data.
                    "max_articles": str(pangea.max_articles),
                    "oldest_article": str(pangea.oldest_article),
                    "only_newest": pangea.only_newest,
                    "include_authors": pangea.include_authors,
                    "exclude_media": pangea.exclude_media,
                    "include_content": pangea.include_content,
                }
            )
        return form_data


def create_source(
    *,
    name: str,
    slug: str,
    source_type: str,
    notes: str,
    spider_arguments: str,
    enabled: bool,
    cron_minute: str,
    cron_hour: str,
    cron_day_of_month: str,
    cron_day_of_week: str,
    cron_month: str,
    convert_images: bool = True,
    convert_video: bool = True,
    feed_url: str = "",
    pangea_domain: str = "",
    pangea_category: str = "",
    content_type: str = "",
    only_newest: bool = True,
    max_articles: int | None = None,
    oldest_article: int | None = None,
    include_authors: bool = True,
    exclude_media: bool = False,
    include_content: bool = True,
    content_format: str = "",
) -> Source:
    """Create a source with its type-specific config row and its job.

    A ``SourceFeed`` row is created when ``source_type == "feed"``; any other
    value creates a ``SourcePangea`` row from the Pangea-related arguments.
    A ``Job`` row holding the schedule and conversion flags is always
    created.

    Returns:
        The newly created ``Source`` instance.
    """
    with database.writer():
        source = Source.create(
            name=name,
            slug=slug,
            source_type=source_type,
            notes=notes,
        )
        if source_type == "feed":
            SourceFeed.create(
                source=source,
                feed_url=feed_url,
            )
        else:
            # NOTE(review): ``max_articles``/``oldest_article`` default to None
            # while the model fields are declared without ``null=True`` --
            # callers presumably always pass integers for Pangea sources.
            SourcePangea.create(
                source=source,
                domain=pangea_domain,
                category_name=pangea_category,
                content_type=content_type,
                only_newest=only_newest,
                max_articles=max_articles,
                oldest_article=oldest_article,
                include_authors=include_authors,
                exclude_media=exclude_media,
                include_content=include_content,
                content_format=content_format,
            )
        Job.create(
            source=source,
            enabled=enabled,
            convert_images=convert_images,
            convert_video=convert_video,
            spider_arguments=spider_arguments,
            cron_minute=cron_minute,
            cron_hour=cron_hour,
            cron_day_of_month=cron_day_of_month,
            cron_day_of_week=cron_day_of_week,
            cron_month=cron_month,
        )
        return source


def update_source(
    source_slug: str,
    *,
    name: str,
    slug: str,
    source_type: str,
    notes: str,
    spider_arguments: str,
    enabled: bool,
    cron_minute: str,
    cron_hour: str,
    cron_day_of_month: str,
    cron_day_of_week: str,
    cron_month: str,
    convert_images: bool = True,
    convert_video: bool = True,
    feed_url: str = "",
    pangea_domain: str = "",
    pangea_category: str = "",
    content_type: str = "",
    only_newest: bool = True,
    max_articles: int | None = None,
    oldest_article: int | None = None,
    include_authors: bool = True,
    exclude_media: bool = False,
    include_content: bool = True,
    content_format: str = "",
) -> Source | None:
    """Update the source identified by ``source_slug`` and its related rows.

    The source's name, notes and type are rewritten, as are all of its job's
    settings. The config row for the other source type is deleted and the
    row for the selected type is created or updated.

    Returns:
        The updated ``Source``, or ``None`` when no source has
        ``source_slug``.

    Raises:
        peewee.DoesNotExist: if the source has no ``Job`` row.
    """
    with database.writer():
        source = Source.get_or_none(Source.slug == source_slug)
        if source is None:
            return None
        # NOTE(review): the ``slug`` argument is accepted but never written;
        # the stored slug stays ``source_slug``. Presumably slugs are meant to
        # be immutable after creation -- confirm.
        # NOTE(review): ``updated_at`` is not refreshed here or on the job;
        # verify whether the schema handles that elsewhere.
        source.name = name
        source.notes = notes
        source.source_type = source_type
        source.save()

        job = Job.get(Job.source == source)
        job.enabled = enabled
        job.convert_images = convert_images
        job.convert_video = convert_video
        job.spider_arguments = spider_arguments
        job.cron_minute = cron_minute
        job.cron_hour = cron_hour
        job.cron_day_of_month = cron_day_of_month
        job.cron_day_of_week = cron_day_of_week
        job.cron_month = cron_month
        job.save()

        if source_type == "feed":
            # Drop any Pangea config left over from a previous source type.
            SourcePangea.delete().where(SourcePangea.source == source).execute()
            feed = SourceFeed.get_or_none(SourceFeed.source == source)
            if feed is None:
                SourceFeed.create(source=source, feed_url=feed_url)
            else:
                feed.feed_url = feed_url
                feed.save()
        else:
            # Drop any feed config left over from a previous source type.
            SourceFeed.delete().where(SourceFeed.source == source).execute()
            pangea = SourcePangea.get_or_none(SourcePangea.source == source)
            if pangea is None:
                SourcePangea.create(
                    source=source,
                    domain=pangea_domain,
                    category_name=pangea_category,
                    content_type=content_type,
                    only_newest=only_newest,
                    max_articles=max_articles,
                    oldest_article=oldest_article,
                    include_authors=include_authors,
                    exclude_media=exclude_media,
                    include_content=include_content,
                    content_format=content_format,
                )
            else:
                pangea.domain = pangea_domain
                pangea.category_name = pangea_category
                pangea.content_type = content_type
                pangea.only_newest = only_newest
                pangea.max_articles = max_articles
                pangea.oldest_article = oldest_article
                pangea.include_authors = include_authors
                pangea.exclude_media = exclude_media
                pangea.include_content = include_content
                pangea.content_format = content_format
                pangea.save()
        return source


def delete_job_source(job_id: int) -> bool:
    """Delete the source that owns job ``job_id``.

    Returns:
        ``False`` when no such job exists; otherwise whether
        ``delete_instance()`` reported at least one deleted row.
    """
    with database.writer():
        job = Job.get_or_none(id=job_id)
        if job is None:
            return False
        source = Source.get_by_id(job.source_id)
        return source.delete_instance() > 0


def delete_source(slug: str) -> bool:
    """Delete the source identified by ``slug``.

    Returns:
        ``False`` when no such source exists; otherwise whether
        ``delete_instance()`` reported at least one deleted row.
    """
    with database.writer():
        source = Source.get_or_none(Source.slug == slug)
        if source is None:
            return False
        return source.delete_instance() > 0


def load_sources() -> tuple[dict[str, object], ...]:
    """Return a display dict for every source, newest ``created_at`` first.

    Jobs and type-specific configs are fetched with one query per table and
    indexed by source id, then combined per source by ``_project_source``.
    """
    with database.reader():
        sources = tuple(Source.select().order_by(Source.created_at.desc()))
        source_ids = tuple(int(source.get_id()) for source in sources)
        if not source_ids:
            return ()
        jobs = {
            job.source_id: job
            for job in Job.select().where(Job.source.in_(source_ids))
        }
        feed_configs = {
            config.source_id: config
            for config in SourceFeed.select().where(SourceFeed.source.in_(source_ids))
        }
        pangea_configs = {
            config.source_id: config
            for config in SourcePangea.select().where(
                SourcePangea.source.in_(source_ids)
            )
        }
        return tuple(
            _project_source(source, jobs, feed_configs, pangea_configs)
            for source in sources
        )


def _project_source(
    source: "Source",
    jobs: dict[int, "Job"],
    feed_configs: dict[int, "SourceFeed"],
    pangea_configs: dict[int, "SourcePangea"],
) -> dict[str, object]:
    """Build the display dict for one source from pre-fetched lookups.

    Raises:
        KeyError: if ``jobs`` or the relevant config mapping has no entry for
            this source's id.
    """
    source_id = int(source.get_id())
    job = jobs[source_id]
    if source.source_type == "feed":
        upstream = feed_configs[source_id].feed_url
        source_type = "Feed"
    else:
        pangea = pangea_configs[source_id]
        upstream = f"{pangea.domain} / {pangea.category_name}"
        source_type = "Pangea"
    return {
        "name": source.name,
        "slug": source.slug,
        "source_type": source_type,
        "upstream": upstream,
        # Fields are emitted as: minute hour day-of-month month day-of-week.
        "schedule": (
            f"cron: {job.cron_minute} {job.cron_hour} {job.cron_day_of_month} "
            f"{job.cron_month} {job.cron_day_of_week}"
        ),
        # Always this literal; execution history is not consulted here.
        "last_run": "Never run",
        "state": "Enabled" if job.enabled else "Disabled",
        "state_tone": "scheduled" if job.enabled else "idle",
    }


class BaseModel(Model):
    """Base class binding every model to the shared ``database`` handle."""

    class Meta:
        database = database


class AppSetting(BaseModel):
    """Key/value application setting; ``value`` holds JSON text."""

    key = TextField(primary_key=True)
    value = TextField()

    class Meta:
        table_name = "app_setting"


class Source(BaseModel):
    """A content source; ``source_type`` is constrained to feed or pangea."""

    created_at = DateTimeField(default=utc_now)
    updated_at = DateTimeField(default=utc_now)
    name = TextField()
    slug = TextField(unique=True)
    source_type = TextField(constraints=[Check("source_type IN ('feed', 'pangea')")])
    notes = TextField(default="")

    class Meta:
        table_name = "source"


class SourceFeed(BaseModel):
    """Feed configuration for a source, keyed one-to-one by the source."""

    source = ForeignKeyField(Source, primary_key=True, backref="feed_config")
    feed_url = TextField()
    etag = TextField(null=True)
    last_modified = TextField(null=True)

    class Meta:
        table_name = "source_feed"


class SourcePangea(BaseModel):
    """Pangea configuration for a source, keyed one-to-one by the source."""

    source = ForeignKeyField(Source, primary_key=True, backref="pangea_config")
    domain = TextField()
    category_name = TextField()
    content_type = TextField()
    only_newest = BooleanField()
    max_articles = IntegerField()
    oldest_article = IntegerField()
    include_authors = BooleanField()
    exclude_media = BooleanField()
    include_content = BooleanField()
    content_format = TextField()

    class Meta:
        table_name = "source_pangea"


class Job(BaseModel):
    """Schedule and processing options for a source (one job per source)."""

    source = ForeignKeyField(Source, unique=True, backref="job")
    created_at = DateTimeField(default=utc_now)
    updated_at = DateTimeField(default=utc_now)
    enabled = BooleanField()
    convert_images = BooleanField(default=True)
    convert_video = BooleanField(default=True)
    spider_arguments = TextField(default="")
    # Cron schedule, one text column per field.
    cron_minute = TextField()
    cron_hour = TextField()
    cron_day_of_month = TextField()
    cron_day_of_week = TextField()
    cron_month = TextField()

    class Meta:
        table_name = "job"


class JobExecution(BaseModel):
    """One execution of a job, with timestamps, status and counters."""

    job = ForeignKeyField(Job, backref="executions")
    created_at = DateTimeField(default=utc_now)
    started_at = DateTimeField(null=True)
    ended_at = DateTimeField(null=True)
    stop_requested_at = DateTimeField(null=True)
    # Holds a ``JobExecutionStatus`` value; the check matches its 0..4 range.
    running_status = IntegerField(
        default=JobExecutionStatus.PENDING,
        constraints=[Check("running_status BETWEEN 0 AND 4")],
    )
    requests_count = IntegerField(default=0)
    items_count = IntegerField(default=0)
    warnings_count = IntegerField(default=0)
    errors_count = IntegerField(default=0)
    bytes_count = IntegerField(default=0)
    retries_count = IntegerField(default=0)
    exceptions_count = IntegerField(default=0)
    cache_size_count = IntegerField(default=0)
    cache_object_count = IntegerField(default=0)
    # Defaults to the text of an empty JSON object.
    raw_stats = TextField(default="{}")

    class Meta:
        table_name = "job_execution"