Fix feed validation output
This commit is contained in:
parent
c834c3c254
commit
db1d9b44b7
13 changed files with 477 additions and 54 deletions
|
|
@ -29,6 +29,7 @@ from repub.model import (
|
|||
SourcePangea,
|
||||
database,
|
||||
initialize_database,
|
||||
load_feed_url,
|
||||
)
|
||||
from repub.spiders.rss_spider import RssFeedSpider
|
||||
|
||||
|
|
@ -271,6 +272,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||
stats_path=stats_path,
|
||||
convert_images=source_config.convert_images,
|
||||
convert_video=source_config.convert_video,
|
||||
feed_url=load_feed_url(),
|
||||
)
|
||||
)
|
||||
print(
|
||||
|
|
@ -424,7 +426,10 @@ def _build_crawl_settings(
|
|||
stats_path: Path,
|
||||
convert_images: bool = True,
|
||||
convert_video: bool = True,
|
||||
feed_url: str | None = None,
|
||||
):
|
||||
if feed_url is None or feed_url.strip() == "":
|
||||
raise ValueError("feed_url setting is required for job runs")
|
||||
base_settings = build_base_settings(
|
||||
RepublisherConfig(
|
||||
config_path=out_dir / "job-runner.toml",
|
||||
|
|
@ -448,6 +453,7 @@ def _build_crawl_settings(
|
|||
priority="cmdline",
|
||||
)
|
||||
settings.set("REPUB_JOB_STATS_PATH", str(stats_path), priority="cmdline")
|
||||
settings.set("REPUBLISHER_FEED_URL", feed_url, priority="cmdline")
|
||||
return settings
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@ DATABASE_PRAGMAS = {
|
|||
SCHEMA_GLOB = "*.sql"
|
||||
MAX_CONCURRENT_JOBS_SETTING_KEY = "max_concurrent_jobs"
|
||||
DEFAULT_MAX_CONCURRENT_JOBS = 1
|
||||
FEED_URL_SETTING_KEY = "feed_url"
|
||||
DEFAULT_FEED_URL = ""
|
||||
|
||||
database = SqliteDatabase(None, pragmas=DATABASE_PRAGMAS)
|
||||
|
||||
|
|
@ -163,8 +165,16 @@ def load_max_concurrent_jobs() -> int:
|
|||
return parsed if parsed >= 1 else DEFAULT_MAX_CONCURRENT_JOBS
|
||||
|
||||
|
||||
def load_feed_url() -> str:
    """Return the stored feed URL setting.

    Looks the value up with ``load_setting`` under ``FEED_URL_SETTING_KEY``
    and falls back to ``DEFAULT_FEED_URL`` when the stored value is not a
    string.
    """
    stored = load_setting(FEED_URL_SETTING_KEY, DEFAULT_FEED_URL)
    if not isinstance(stored, str):
        return DEFAULT_FEED_URL
    return stored
|
||||
|
||||
|
||||
def load_settings_form() -> dict[str, object]:
    """Return the current values used to populate the settings form.

    Returns:
        A dict with ``"max_concurrent_jobs"`` (from
        ``load_max_concurrent_jobs``) and ``"feed_url"`` (from
        ``load_feed_url``).
    """
    # Exactly one return carrying both keys: an earlier return holding only
    # "max_concurrent_jobs" would leave the "feed_url" entry unreachable.
    return {
        "max_concurrent_jobs": load_max_concurrent_jobs(),
        "feed_url": load_feed_url(),
    }
|
||||
|
||||
|
||||
def load_source_form(slug: str) -> dict[str, object] | None:
|
||||
|
|
|
|||
|
|
@ -41,7 +41,8 @@ def settings_page(
|
|||
"data-signals": "{_formError: '', _formSuccess: ''}",
|
||||
"data-signals__ifmissing": (
|
||||
"{"
|
||||
f"maxConcurrentJobs: '{_value(settings, 'max_concurrent_jobs', '1')}'"
|
||||
f"maxConcurrentJobs: '{_value(settings, 'max_concurrent_jobs', '1')}', "
|
||||
f"feedUrl: '{_value(settings, 'feed_url')}'"
|
||||
"}"
|
||||
),
|
||||
"data-on:submit": f"@post('{action_path}')",
|
||||
|
|
@ -74,6 +75,14 @@ def settings_page(
|
|||
help_text="Must be an integer greater than or equal to 1.",
|
||||
signal_name="maxConcurrentJobs",
|
||||
),
|
||||
input_field(
|
||||
label="Feed URL",
|
||||
field_id="feed-domain",
|
||||
value=_value(settings, "feed_url"),
|
||||
placeholder="https://mirror.example",
|
||||
help_text="Example: http://localhost:8080. Must include http:// or https:// and point at the public base URL that serves /feeds/.",
|
||||
signal_name="feedUrl",
|
||||
),
|
||||
],
|
||||
h.div(class_="flex flex-wrap justify-end gap-3 pt-2")[
|
||||
muted_action_link(href="/", label="Back to dashboard"),
|
||||
|
|
|
|||
53
repub/rss.py
53
repub/rss.py
|
|
@ -1,5 +1,7 @@
|
|||
from datetime import datetime
|
||||
from time import mktime
|
||||
import re
|
||||
from calendar import timegm
|
||||
from datetime import UTC, datetime
|
||||
from email.utils import format_datetime
|
||||
|
||||
import lxml.etree as ET
|
||||
import lxml.html
|
||||
|
|
@ -93,20 +95,54 @@ def serialize(root):
|
|||
|
||||
def date_format(d):
|
||||
if d:
|
||||
return d.strftime("%a, %d %b %Y %H:%M:%S %z")
|
||||
return format_datetime(d.astimezone(UTC))
|
||||
|
||||
|
||||
def to_datetime(struct_time):
|
||||
if struct_time:
|
||||
return datetime.fromtimestamp(mktime(struct_time))
|
||||
return datetime.fromtimestamp(timegm(struct_time), tz=UTC)
|
||||
|
||||
|
||||
def normalize_date(struct_time):
    """Turn a parsed time tuple into a formatted date string.

    Converts ``struct_time`` with ``to_datetime`` and formats the result
    with ``date_format``.
    """
    moment = to_datetime(struct_time)
    return date_format(moment)
|
||||
|
||||
|
||||
# Attribute names removed from every element when HTML is cleaned
# (used by sanitize_html and munge_cdata_html).
HTML_ATTRIBUTE_DENYLIST = frozenset({"contenteditable", "mode", "querystring"})
|
||||
|
||||
|
||||
def parse_html_fragment(raw_html):
    """Parse an HTML snippet into an lxml element tree under a wrapper parent.

    Args:
        raw_html: HTML markup as a string, or ``None``.

    Returns:
        The wrapper element produced by
        ``lxml.html.fragment_fromstring(..., create_parent=True)``, or
        ``None`` when the input is ``None`` or contains only whitespace.
    """
    # Treat a missing value like an empty one so callers that forward an
    # optional feed field do not crash on None.strip().
    if raw_html is None or raw_html.strip() == "":
        return None
    return lxml.html.fragment_fromstring(raw_html, create_parent=True)
|
||||
|
||||
|
||||
def sanitize_html(raw_html: str) -> str:
    """Strip denylisted attributes from every element of an HTML snippet.

    Args:
        raw_html: HTML markup to clean.

    Returns:
        The re-serialized markup with every attribute named in
        ``HTML_ATTRIBUTE_DENYLIST`` removed. When ``parse_html_fragment``
        yields nothing to parse, the input is returned unchanged.
    """
    root = parse_html_fragment(raw_html)
    if root is None:
        return raw_html
    for node in root.iter():
        # iter() also yields comments and processing instructions, whose tag
        # is not a string and which carry no attributes; leave them alone
        # rather than mutating their attribute mapping.
        if not isinstance(node.tag, str):
            continue
        for name in HTML_ATTRIBUTE_DENYLIST:
            node.attrib.pop(name, None)
    # Serialize the wrapper's contents only (leading text plus each child),
    # not the wrapper element itself.
    pieces = [root.text or ""]
    pieces.extend(lxml.html.tostring(child, encoding="unicode") for child in root)
    return "".join(pieces)
|
||||
|
||||
|
||||
def plain_text_summary(raw_html: str | None, max_length: int = 4000) -> str | None:
    """Reduce an HTML snippet to a single-line plain-text summary.

    Args:
        raw_html: HTML markup, or ``None``.
        max_length: Maximum number of characters kept in the result.

    Returns:
        The text content with whitespace runs collapsed to single spaces and
        truncated to ``max_length`` characters, or ``None`` when the input is
        ``None`` or no text remains.
    """
    if raw_html is None:
        return None
    parsed = parse_html_fragment(raw_html)
    if parsed is None:
        text = raw_html
    else:
        text = parsed.text_content()
    collapsed = re.sub(r"\s+", " ", text).strip()
    return collapsed[:max_length] if collapsed else None
|
||||
|
||||
|
||||
def munge_cdata_html(raw_html, replace_link_fn) -> str:
|
||||
html = lxml.html.fromstring(raw_html)
|
||||
html = parse_html_fragment(raw_html)
|
||||
if html is None:
|
||||
return raw_html
|
||||
for el, attr, link, pos in html.iterlinks():
|
||||
if attr == "srcset":
|
||||
# these are a messy special case
|
||||
|
|
@ -133,4 +169,9 @@ def munge_cdata_html(raw_html, replace_link_fn) -> str:
|
|||
else:
|
||||
new = cur[:pos] + new_link + cur[pos + len(link) :]
|
||||
el.set(attr, new)
|
||||
return lxml.html.tostring(html, encoding="utf-8", pretty_print=True).decode("utf-8")
|
||||
for el in html.iter():
|
||||
for attr in HTML_ATTRIBUTE_DENYLIST:
|
||||
el.attrib.pop(attr, None)
|
||||
return (html.text or "") + "".join(
|
||||
lxml.html.tostring(child, encoding="unicode") for child in html
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,7 +7,18 @@ from scrapy.spiders import Spider
|
|||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
from repub.items import ChannelElementItem, ElementItem
|
||||
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
|
||||
from repub.rss import (
|
||||
ATOM,
|
||||
CDATA,
|
||||
CONTENT,
|
||||
ITUNES,
|
||||
MEDIA,
|
||||
E,
|
||||
munge_cdata_html,
|
||||
normalize_date,
|
||||
plain_text_summary,
|
||||
sanitize_html,
|
||||
)
|
||||
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
|
||||
|
||||
|
||||
|
|
@ -42,11 +53,57 @@ class BaseRssFeedSpider(Spider):
|
|||
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
|
||||
elif file_type == FileType.AUDIO:
|
||||
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
|
||||
return f"{file_dir}/{local_path}"
|
||||
relative_path = f"{file_dir}/{local_path}"
|
||||
return self.absolute_feed_url(relative_path)
|
||||
|
||||
def rewrite_image_url(self, url):
    """Rewrite ``url`` as an image by delegating to ``rewrite_file_url``."""
    image_kind = FileType.IMAGE
    return self.rewrite_file_url(image_kind, url)
|
||||
|
||||
def absolute_feed_url(self, path: str) -> str:
    """Build the public URL for a path inside this feed's directory.

    Args:
        path: Path relative to the feed directory; leading slashes are
            ignored.

    Returns:
        ``<REPUBLISHER_FEED_URL>/feeds/<feed_name>/<path>`` when the
        ``REPUBLISHER_FEED_URL`` setting is set, otherwise ``path`` unchanged.
    """
    # "or ''" keeps a setting explicitly stored as None from being
    # stringified into the literal "None" and used as a URL prefix.
    base = str(self.settings.get("REPUBLISHER_FEED_URL", "") or "").rstrip("/")
    if base == "":
        return path
    return f"{base}/feeds/{self.feed_name}/{path.lstrip('/')}"
|
||||
|
||||
def compact_attrib(self, **attrib):
    """Return the keyword arguments as string attributes, dropping blanks.

    Entries whose value is ``None`` or the empty string are omitted; every
    remaining value is converted with ``str``.
    """
    kept = {}
    for name, raw in attrib.items():
        if raw in (None, ""):
            continue
        kept[name] = str(raw)
    return kept
|
||||
|
||||
def itunes_explicit_value(self, value) -> str:
    """Normalize an explicit-content flag to ``"true"`` or ``"false"``.

    Strings count as explicit when, after trimming and lowercasing, they are
    one of ``true``, ``yes`` or ``explicit``; any other value is judged by
    its truthiness.
    """
    if isinstance(value, str):
        flagged = value.strip().lower() in {"true", "yes", "explicit"}
    else:
        flagged = bool(value)
    return "true" if flagged else "false"
|
||||
|
||||
def publisher_email(self, feed) -> str | None:
|
||||
publisher_detail = feed.get("publisher_detail")
|
||||
if publisher_detail and publisher_detail.get("email"):
|
||||
return publisher_detail.get("email")
|
||||
publisher = feed.get("publisher")
|
||||
if isinstance(publisher, str) and "@" in publisher:
|
||||
return publisher
|
||||
return None
|
||||
|
||||
def itunes_category(self, feed) -> str:
    """Return the iTunes category for the feed.

    The ``feed`` argument is accepted but not consulted; the category is
    always ``"News"``.
    """
    _ = feed  # intentionally unused
    return "News"
|
||||
|
||||
def latest_entry_date(self, feed) -> str | None:
    """Return the formatted date of the most recently published entry.

    Args:
        feed: Parsed feed exposing ``entries`` (each with an optional
            ``published_parsed`` time tuple) and ``feed`` (channel metadata).

    Returns:
        The newest entry's ``published_parsed`` formatted by
        ``normalize_date``. When no entry has one, the channel's
        ``updated_parsed`` is used, then its ``published_parsed``; ``None``
        if none of them is available.
    """
    stamps = [
        entry.get("published_parsed")
        for entry in feed.entries
        if entry.get("published_parsed") is not None
    ]
    if stamps:
        # Pick the maximum on the time tuples, which order chronologically
        # (year, month, day, ...). Comparing the formatted strings would sort
        # by weekday name ("Wed" > "Tue" > "Thu") instead of by time.
        return normalize_date(max(stamps))
    return normalize_date(feed.feed.get("updated_parsed")) or normalize_date(
        feed.feed.get("published_parsed")
    )
|
||||
|
||||
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
|
||||
urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}
|
||||
|
||||
|
|
@ -100,14 +157,31 @@ class BaseRssFeedSpider(Spider):
|
|||
channel = E.channel(
|
||||
E.title(f.get("title")),
|
||||
E.link(f.get("link")),
|
||||
E.description(f.get("description")),
|
||||
E.description(sanitize_html(f.get("description", ""))),
|
||||
E.language(f.get("language")),
|
||||
E.copyright(f.get("copyright")),
|
||||
E.webMaster(f.get("publisher")),
|
||||
E.webMaster(self.WEBMASTER_VALUE),
|
||||
E.generator(f.get("generator")),
|
||||
E.pubDate(normalize_date(f.get("published_parsed"))),
|
||||
E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
|
||||
ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
|
||||
E.lastBuildDate(self.latest_entry_date(feed)),
|
||||
ITUNES.explicit(
|
||||
self.itunes_explicit_value(f.get("itunes_explicit", False))
|
||||
),
|
||||
ITUNES.category(text=self.itunes_category(f)),
|
||||
(
|
||||
ITUNES.owner(ITUNES.email(email))
|
||||
if (email := self.publisher_email(f))
|
||||
else None
|
||||
),
|
||||
(
|
||||
ATOM.link(
|
||||
rel="self",
|
||||
href=self.absolute_feed_url("feed.rss"),
|
||||
type="application/rss+xml",
|
||||
)
|
||||
if self.settings.get("REPUBLISHER_FEED_URL")
|
||||
else None
|
||||
),
|
||||
)
|
||||
for tag in f.get("tags", []):
|
||||
channel.append(E.category(tag.term))
|
||||
|
|
@ -119,7 +193,7 @@ class BaseRssFeedSpider(Spider):
|
|||
E.title(f.get("title")),
|
||||
E.link(f.get("link")),
|
||||
E.url(self.rewrite_image_url(f.image.get("href"))),
|
||||
E.description(f.get("description")),
|
||||
E.description(sanitize_html(f.get("description", ""))),
|
||||
)
|
||||
image_urls.append(f.image.get("href"))
|
||||
else:
|
||||
|
|
@ -127,7 +201,7 @@ class BaseRssFeedSpider(Spider):
|
|||
E.title(f.image.get("title")),
|
||||
E.link(f.image.get("link")),
|
||||
E.url(self.rewrite_image_url(f.image.get("url"))),
|
||||
E.description(f.image.get("description")),
|
||||
E.description(sanitize_html(f.image.get("description", ""))),
|
||||
E.width(f.image.get("width")),
|
||||
E.height(f.image.get("height")),
|
||||
)
|
||||
|
|
@ -205,14 +279,14 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
item = E.item(
|
||||
E.title(entry.get("title")),
|
||||
E.link(entry.get("link")),
|
||||
E.description(entry.get("description")),
|
||||
E.description(sanitize_html(entry.get("description", ""))),
|
||||
E.guid(
|
||||
entry.get("id"),
|
||||
{"isPermaLink": "true" if entry.guidislink else "false"},
|
||||
),
|
||||
E.pubDate(normalize_date(entry.get("published_parsed"))),
|
||||
E.author(entry.get("author")),
|
||||
ITUNES.summary(entry.get("summary")),
|
||||
ITUNES.summary(plain_text_summary(entry.get("summary"))),
|
||||
ITUNES.duration(entry.get("itunes_duration")),
|
||||
ITUNES.image(
|
||||
None,
|
||||
|
|
@ -230,9 +304,11 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
|
||||
item.append(
|
||||
E.enclosure(
|
||||
E.url(self.rewrite_file_url(file_type, url)),
|
||||
E.length(enc.get("length")),
|
||||
E.type(enc.get("type")),
|
||||
**self.compact_attrib(
|
||||
url=self.rewrite_file_url(file_type, url),
|
||||
length=enc.get("length"),
|
||||
type=enc.get("type"),
|
||||
)
|
||||
)
|
||||
)
|
||||
self.logger.debug(
|
||||
|
|
@ -261,19 +337,21 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
)
|
||||
item.append(
|
||||
MEDIA.content(
|
||||
E.url(self.rewrite_file_url(file_type, media.get("url"))),
|
||||
E.type(media.get("type")),
|
||||
E.medium(media.get("medium")),
|
||||
E.isDefault(media.get("isDefault")),
|
||||
E.expression(media.get("expression")),
|
||||
E.bitrate(media.get("bitrate")),
|
||||
E.framerate(media.get("framerate")),
|
||||
E.samplingrate(media.get("samplingrate")),
|
||||
E.channels(media.get("channels")),
|
||||
E.duration(media.get("duration")),
|
||||
E.height(media.get("height")),
|
||||
E.width(media.get("width")),
|
||||
E.lang(media.get("lang")),
|
||||
**self.compact_attrib(
|
||||
url=self.rewrite_file_url(file_type, media.get("url")),
|
||||
type=media.get("type"),
|
||||
medium=media.get("medium"),
|
||||
isDefault=media.get("isDefault"),
|
||||
expression=media.get("expression"),
|
||||
bitrate=media.get("bitrate"),
|
||||
framerate=media.get("framerate"),
|
||||
samplingrate=media.get("samplingrate"),
|
||||
channels=media.get("channels"),
|
||||
duration=media.get("duration"),
|
||||
height=media.get("height"),
|
||||
width=media.get("width"),
|
||||
lang=media.get("lang"),
|
||||
)
|
||||
)
|
||||
)
|
||||
add_url(file_type, media.get("url"))
|
||||
|
|
@ -289,3 +367,5 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
video_urls=video_urls,
|
||||
videos=[],
|
||||
)
|
||||
|
||||
WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"
|
||||
|
|
|
|||
10
repub/web.py
10
repub/web.py
|
|
@ -92,6 +92,7 @@ class SourceFormData(TypedDict):
|
|||
|
||||
class SettingsFormData(TypedDict):
|
||||
max_concurrent_jobs: int
|
||||
feed_url: str
|
||||
|
||||
|
||||
DEFAULT_PANGEA_CONTENT_FORMAT = "MOBILE_3"
|
||||
|
|
@ -293,6 +294,7 @@ def create_app(*, dev_mode: bool = False) -> Quart:
|
|||
|
||||
assert settings is not None
|
||||
save_setting("max_concurrent_jobs", settings["max_concurrent_jobs"])
|
||||
save_setting("feed_url", settings["feed_url"])
|
||||
trigger_refresh(app)
|
||||
return DatastarResponse(SSE.redirect("/settings"))
|
||||
|
||||
|
|
@ -709,11 +711,17 @@ def validate_settings_form(
|
|||
return None, "Missing form data."
|
||||
|
||||
max_concurrent_jobs = _parse_int(_read_string(signals, "maxConcurrentJobs"))
|
||||
feed_url = _read_string(signals, "feedUrl").rstrip("/")
|
||||
if max_concurrent_jobs is None:
|
||||
return None, "Max concurrent jobs must be an integer."
|
||||
if max_concurrent_jobs < 1:
|
||||
return None, "Max concurrent jobs must be at least 1."
|
||||
return {"max_concurrent_jobs": max_concurrent_jobs}, None
|
||||
if feed_url != "" and not _is_valid_url(feed_url):
|
||||
return None, "Feed URL must be a valid URL."
|
||||
return {
|
||||
"max_concurrent_jobs": max_concurrent_jobs,
|
||||
"feed_url": feed_url,
|
||||
}, None
|
||||
|
||||
|
||||
def _read_string(signals: dict[str, object], key: str, *, strip: bool = True) -> str:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue