Fix feed validation output
This commit is contained in:
parent
c834c3c254
commit
db1d9b44b7
13 changed files with 477 additions and 54 deletions
|
|
@ -7,7 +7,18 @@ from scrapy.spiders import Spider
|
|||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
from repub.items import ChannelElementItem, ElementItem
|
||||
from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, munge_cdata_html, normalize_date
|
||||
from repub.rss import (
|
||||
ATOM,
|
||||
CDATA,
|
||||
CONTENT,
|
||||
ITUNES,
|
||||
MEDIA,
|
||||
E,
|
||||
munge_cdata_html,
|
||||
normalize_date,
|
||||
plain_text_summary,
|
||||
sanitize_html,
|
||||
)
|
||||
from repub.utils import FileType, determine_file_type, local_file_path, local_image_path
|
||||
|
||||
|
||||
|
|
@ -42,11 +53,57 @@ class BaseRssFeedSpider(Spider):
|
|||
file_dir = self.settings["REPUBLISHER_VIDEO_DIR"]
|
||||
elif file_type == FileType.AUDIO:
|
||||
file_dir = self.settings["REPUBLISHER_AUDIO_DIR"]
|
||||
return f"{file_dir}/{local_path}"
|
||||
relative_path = f"{file_dir}/{local_path}"
|
||||
return self.absolute_feed_url(relative_path)
|
||||
|
||||
def rewrite_image_url(self, url):
    """Rewrite ``url`` as an image URL.

    Delegates to ``self.rewrite_file_url`` with ``FileType.IMAGE`` and
    returns whatever that call returns.
    """
    image_kind = FileType.IMAGE
    return self.rewrite_file_url(image_kind, url)
|
||||
|
||||
def absolute_feed_url(self, path: str) -> str:
    """Return ``path`` as an absolute URL below the configured feed root.

    Reads the ``REPUBLISHER_FEED_URL`` setting. When it is unset, ``None``
    or empty (after stripping trailing slashes), ``path`` is returned
    unchanged. Otherwise the result is
    ``<feed url>/feeds/<self.feed_name>/<path>``, with trailing slashes
    removed from the feed URL and leading slashes removed from ``path`` so
    the pieces join with exactly one ``/``.
    """
    # ``or ""`` instead of a ``.get`` default: a setting explicitly set to
    # None would otherwise be stringified to the literal "None" and end up
    # as a bogus URL prefix.
    feed_url = str(self.settings.get("REPUBLISHER_FEED_URL") or "").rstrip("/")
    if not feed_url:
        return path
    return f"{feed_url}/feeds/{self.feed_name}/{path.lstrip('/')}"
|
||||
|
||||
def compact_attrib(self, **attrib):
    """Build an attribute dict from keyword arguments, skipping blanks.

    Entries whose value is ``None`` or the empty string are dropped; every
    remaining value is converted with ``str``. Other falsy values such as
    ``0`` are kept.
    """
    cleaned = {}
    for name, raw in attrib.items():
        if raw in (None, ""):
            continue
        cleaned[name] = str(raw)
    return cleaned
|
||||
|
||||
def itunes_explicit_value(self, value) -> str:
    """Map an explicit-content flag to the string ``"true"`` or ``"false"``.

    Strings are compared case-insensitively after stripping whitespace:
    ``"true"``, ``"yes"`` and ``"explicit"`` give ``"true"``, any other
    string gives ``"false"``. Non-string values are judged by truthiness.
    """
    if not isinstance(value, str):
        return "true" if value else "false"

    normalized = value.strip().lower()
    if normalized in {"true", "yes", "explicit"}:
        return "true"
    return "false"
|
||||
|
||||
def publisher_email(self, feed) -> str | None:
|
||||
publisher_detail = feed.get("publisher_detail")
|
||||
if publisher_detail and publisher_detail.get("email"):
|
||||
return publisher_detail.get("email")
|
||||
publisher = feed.get("publisher")
|
||||
if isinstance(publisher, str) and "@" in publisher:
|
||||
return publisher
|
||||
return None
|
||||
|
||||
def itunes_category(self, feed) -> str:
    """Return the iTunes category text for the channel.

    The value is the constant ``"News"``; ``feed`` is accepted but not
    consulted.
    """
    category = "News"
    del feed  # parameter is deliberately unused
    return category
|
||||
|
||||
def latest_entry_date(self, feed) -> str | None:
    """Return the normalized date of the most recently published entry.

    Collects ``published_parsed`` from every entry in ``feed.entries`` that
    has one, picks the latest, and passes it through ``normalize_date``.
    If no entry carries a publication date, falls back to the channel's
    ``updated_parsed`` and then its ``published_parsed`` (both read from
    ``feed.feed``).

    NOTE(review): the maximum is taken over the raw parsed values rather
    than over ``normalize_date`` output. ``normalize_date`` is defined
    elsewhere; its result is used as ``pubDate`` text, so it is presumably
    a formatted date string, and comparing such strings does not give
    chronological order. This assumes ``published_parsed`` values are
    ``time.struct_time``-like tuples that sort chronologically -- confirm
    against the feed parser in use.
    """
    parsed_dates = [
        parsed
        for entry in feed.entries
        if (parsed := entry.get("published_parsed")) is not None
    ]
    if parsed_dates:
        # Compare first, format once: ordering the parsed tuples is
        # chronological, ordering formatted strings is not.
        return normalize_date(max(parsed_dates))

    channel = feed.feed
    return normalize_date(channel.get("updated_parsed")) or normalize_date(
        channel.get("published_parsed")
    )
|
||||
|
||||
def munge_cdata_html(self, html) -> Tuple[str, Dict[FileType, List[str]]]:
|
||||
urls = {FileType.IMAGE: [], FileType.VIDEO: [], FileType.AUDIO: []}
|
||||
|
||||
|
|
@ -100,14 +157,31 @@ class BaseRssFeedSpider(Spider):
|
|||
channel = E.channel(
|
||||
E.title(f.get("title")),
|
||||
E.link(f.get("link")),
|
||||
E.description(f.get("description")),
|
||||
E.description(sanitize_html(f.get("description", ""))),
|
||||
E.language(f.get("language")),
|
||||
E.copyright(f.get("copyright")),
|
||||
E.webMaster(f.get("publisher")),
|
||||
E.webMaster(self.WEBMASTER_VALUE),
|
||||
E.generator(f.get("generator")),
|
||||
E.pubDate(normalize_date(f.get("published_parsed"))),
|
||||
E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
|
||||
ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
|
||||
E.lastBuildDate(self.latest_entry_date(feed)),
|
||||
ITUNES.explicit(
|
||||
self.itunes_explicit_value(f.get("itunes_explicit", False))
|
||||
),
|
||||
ITUNES.category(text=self.itunes_category(f)),
|
||||
(
|
||||
ITUNES.owner(ITUNES.email(email))
|
||||
if (email := self.publisher_email(f))
|
||||
else None
|
||||
),
|
||||
(
|
||||
ATOM.link(
|
||||
rel="self",
|
||||
href=self.absolute_feed_url("feed.rss"),
|
||||
type="application/rss+xml",
|
||||
)
|
||||
if self.settings.get("REPUBLISHER_FEED_URL")
|
||||
else None
|
||||
),
|
||||
)
|
||||
for tag in f.get("tags", []):
|
||||
channel.append(E.category(tag.term))
|
||||
|
|
@ -119,7 +193,7 @@ class BaseRssFeedSpider(Spider):
|
|||
E.title(f.get("title")),
|
||||
E.link(f.get("link")),
|
||||
E.url(self.rewrite_image_url(f.image.get("href"))),
|
||||
E.description(f.get("description")),
|
||||
E.description(sanitize_html(f.get("description", ""))),
|
||||
)
|
||||
image_urls.append(f.image.get("href"))
|
||||
else:
|
||||
|
|
@ -127,7 +201,7 @@ class BaseRssFeedSpider(Spider):
|
|||
E.title(f.image.get("title")),
|
||||
E.link(f.image.get("link")),
|
||||
E.url(self.rewrite_image_url(f.image.get("url"))),
|
||||
E.description(f.image.get("description")),
|
||||
E.description(sanitize_html(f.image.get("description", ""))),
|
||||
E.width(f.image.get("width")),
|
||||
E.height(f.image.get("height")),
|
||||
)
|
||||
|
|
@ -205,14 +279,14 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
item = E.item(
|
||||
E.title(entry.get("title")),
|
||||
E.link(entry.get("link")),
|
||||
E.description(entry.get("description")),
|
||||
E.description(sanitize_html(entry.get("description", ""))),
|
||||
E.guid(
|
||||
entry.get("id"),
|
||||
{"isPermaLink": "true" if entry.guidislink else "false"},
|
||||
),
|
||||
E.pubDate(normalize_date(entry.get("published_parsed"))),
|
||||
E.author(entry.get("author")),
|
||||
ITUNES.summary(entry.get("summary")),
|
||||
ITUNES.summary(plain_text_summary(entry.get("summary"))),
|
||||
ITUNES.duration(entry.get("itunes_duration")),
|
||||
ITUNES.image(
|
||||
None,
|
||||
|
|
@ -230,9 +304,11 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
file_type = determine_file_type(url=url, mimetype=enc.get("type"))
|
||||
item.append(
|
||||
E.enclosure(
|
||||
E.url(self.rewrite_file_url(file_type, url)),
|
||||
E.length(enc.get("length")),
|
||||
E.type(enc.get("type")),
|
||||
**self.compact_attrib(
|
||||
url=self.rewrite_file_url(file_type, url),
|
||||
length=enc.get("length"),
|
||||
type=enc.get("type"),
|
||||
)
|
||||
)
|
||||
)
|
||||
self.logger.debug(
|
||||
|
|
@ -261,19 +337,21 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
)
|
||||
item.append(
|
||||
MEDIA.content(
|
||||
E.url(self.rewrite_file_url(file_type, media.get("url"))),
|
||||
E.type(media.get("type")),
|
||||
E.medium(media.get("medium")),
|
||||
E.isDefault(media.get("isDefault")),
|
||||
E.expression(media.get("expression")),
|
||||
E.bitrate(media.get("bitrate")),
|
||||
E.framerate(media.get("framerate")),
|
||||
E.samplingrate(media.get("samplingrate")),
|
||||
E.channels(media.get("channels")),
|
||||
E.duration(media.get("duration")),
|
||||
E.height(media.get("height")),
|
||||
E.width(media.get("width")),
|
||||
E.lang(media.get("lang")),
|
||||
**self.compact_attrib(
|
||||
url=self.rewrite_file_url(file_type, media.get("url")),
|
||||
type=media.get("type"),
|
||||
medium=media.get("medium"),
|
||||
isDefault=media.get("isDefault"),
|
||||
expression=media.get("expression"),
|
||||
bitrate=media.get("bitrate"),
|
||||
framerate=media.get("framerate"),
|
||||
samplingrate=media.get("samplingrate"),
|
||||
channels=media.get("channels"),
|
||||
duration=media.get("duration"),
|
||||
height=media.get("height"),
|
||||
width=media.get("width"),
|
||||
lang=media.get("lang"),
|
||||
)
|
||||
)
|
||||
)
|
||||
add_url(file_type, media.get("url"))
|
||||
|
|
@ -289,3 +367,5 @@ class RssFeedSpider(BaseRssFeedSpider):
|
|||
video_urls=video_urls,
|
||||
videos=[],
|
||||
)
|
||||
|
||||
WEBMASTER_VALUE = "support@guardianproject.info (Guardian Project)"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue