basic feed rebuilding
This commit is contained in:
parent
4ab05c9000
commit
6add19c288
17 changed files with 772 additions and 69 deletions
4
repub/spiders/__init__.py
Normal file
4
repub/spiders/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
168
repub/spiders/rss_spider.py
Normal file
168
repub/spiders/rss_spider.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
from repub.items import (
|
||||
ChannelElementItem,
|
||||
ElementItem,
|
||||
)
|
||||
import feedparser
|
||||
import logging
|
||||
|
||||
from repub.rss import E, ITUNES, CONTENT, MEDIA, CDATA, normalize_date
|
||||
|
||||
|
||||
class BaseRssFeedSpider(Spider):
    """
    Base class for spiders that scrape from RSS feeds.

    The response body is parsed with feedparser, the channel metadata is
    rebuilt as an XML ``channel`` element, and every feed entry is handed to
    :meth:`parse_entry`, which subclasses must implement.
    """

    def parse_feed(self, feed_text):
        """Parse raw feed data with feedparser.

        :param feed_text: the feed document; ``_parse`` passes
            ``response.body`` here.
        :return: the feedparser result, or ``None`` when feedparser flags the
            document as "bozo" (the problem is logged first).
        """
        parsed = feedparser.parse(feed_text, sanitize_html=False)
        if not parsed.bozo:
            return parsed

        error = parsed.bozo_exception
        logging.error("Bozo feed data. %s: %r", error.__class__.__name__, error)
        # SAX-style parse errors carry a position; use it to show the
        # offending line of the document.
        if hasattr(error, "getLineNumber") and hasattr(error, "getMessage"):
            line = error.getLineNumber()
            # %s rather than %d so a missing (None) line number still logs.
            logging.error("Line %s: %s", line, error.getMessage())
            segment = self._line_segment(feed_text, line)
            if segment is not None:
                logging.info("Body segment with error: %r", segment)
        return None

    @staticmethod
    def _line_segment(feed_text, line):
        """Return the 1-based line ``line`` of ``feed_text``, or ``None``.

        Works for both ``str`` and ``bytes`` input (splitting bytes with a
        ``str`` separator raises ``TypeError``) and never raises when the
        reported line number is missing or outside the document.
        """
        if isinstance(feed_text, (bytes, bytearray)):
            newline = b"\n"
        elif isinstance(feed_text, str):
            newline = "\n"
        else:
            return None
        if not isinstance(line, int) or line < 1:
            return None
        lines = feed_text.split(newline)
        if line > len(lines):
            return None
        return lines[line - 1]

    def parse_channel_meta(self, response, feed):
        """Rebuild the channel-level metadata of ``feed`` as an XML element.

        :param response: the response the feed came from (unused here, kept
            so overrides can use it).
        :param feed: the parsed feed returned by :meth:`parse_feed`.
        :return: a ``ChannelElementItem`` wrapping the ``channel`` element.
        """
        # NOTE(review): missing feed fields reach the element builders as
        # None; this assumes the builders in repub.rss accept None - confirm.
        f = feed.feed
        channel = E.channel(
            E.title(f.get("title")),
            E.link(f.get("link")),
            E.description(f.get("description")),
            E.language(f.get("language")),
            E.copyright(f.get("copyright")),
            E.webMaster(f.get("publisher")),
            E.generator(f.get("generator")),
            E.pubDate(normalize_date(f.get("published_parsed"))),
            E.lastBuildDate(normalize_date(f.get("updated_parsed"))),
            ITUNES.explicit("yes" if f.get("itunes_explicit", False) else "no"),
        )
        for tag in f.get("tags", []):
            channel.append(E.category(tag.term))

        if "image" in f:
            if "href" in f.image:
                # Only an image href is available: reuse the channel's own
                # title, link and description for the image element.
                image = E.image(
                    E.title(f.get("title")),
                    E.link(f.get("link")),
                    E.url(f.image.get("href")),
                    E.description(f.get("description")),
                )
            else:
                image = E.image(
                    E.title(f.image.get("title")),
                    E.link(f.image.get("link")),
                    E.url(f.image.get("url")),
                    E.description(f.image.get("description")),
                    E.width(f.image.get("width")),
                    E.height(f.image.get("height")),
                )
            channel.append(image)
        return ChannelElementItem(el=channel)

    def _parse(self, response, **kwargs):
        """Adapt and parse ``response``; yield its items if the feed is usable.

        Returns ``None`` when the feed could not be parsed or has no
        feed-level data.
        """
        response = self.adapt_response(response)
        feed = self.parse_feed(response.body)
        if feed and feed.feed:
            return self.parse_entries(response, feed)

    def parse_entry(self, response, feed, entry):
        """This method must be overridden with your custom spider functionality"""
        raise NotImplementedError

    def parse_entries(self, response, feed):
        """Yield the channel item, then the processed results of every entry."""
        channel = self.parse_channel_meta(response, feed)
        yield channel
        for entry in feed.entries:
            ret = iterate_spider_output(self.parse_entry(response, feed, entry))
            yield from self.process_results(response, feed, ret)

    def process_results(self, response, feed, results):
        """This overridable method is called with the results (items or
        requests) produced for each feed entry, and is intended to perform
        any last-minute processing required before returning the results to
        the framework core, for example setting the item GUIDs. It receives
        the response, the parsed feed and an iterable of results. It must
        return an iterable of results (items or requests).
        """
        return results

    def adapt_response(self, response):
        """You can override this function in order to make any changes you
        want to the feed before parsing it. This function must return a
        response.
        """
        return response
|
||||
|
||||
|
||||
class RssFeedSpider(BaseRssFeedSpider):
    """A generic RSS Feed spider"""

    name = "rss_spider"

    def __init__(self, urls, **kwargs):
        """Create the spider.

        :param urls: the feed URLs to crawl; either an iterable of URLs or a
            single URL string.
        :param kwargs: forwarded to the base spider constructor.
        """
        # A bare string would otherwise be iterated character by character
        # when used as start_urls, so wrap it in a list.
        self.start_urls = [urls] if isinstance(urls, str) else urls
        super().__init__(**kwargs)

    def parse_entry(self, response, feed, entry):
        """Rebuild one feed entry as an XML ``item`` element.

        :param response: the response the feed came from (unused here).
        :param feed: the parsed feed the entry belongs to (unused here).
        :param entry: a single feedparser entry.
        :return: an ``ElementItem`` wrapping the ``item`` element.
        """
        # NOTE(review): missing entry fields reach the element builders as
        # None; this assumes the builders in repub.rss accept None - confirm.
        item = E.item(
            E.title(entry.get("title")),
            E.link(entry.get("link")),
            E.description(entry.get("description")),
            E.guid(
                entry.get("id"),
                # .get(): entries without a guid have no "guidislink" key, and
                # attribute access on a missing key raises AttributeError.
                {"isPermaLink": "true" if entry.get("guidislink") else "false"},
            ),
            E.pubDate(normalize_date(entry.get("published_parsed"))),
            E.author(entry.get("author")),
            ITUNES.summary(entry.get("summary")),
            ITUNES.duration(entry.get("itunes_duration")),
        )

        # NOTE(review): RSS 2.0 defines url/length/type as attributes of
        # <enclosure>; they are emitted as child elements here and kept that
        # way because downstream consumers are not visible - confirm.
        # .get(): feedparser derives "enclosures" from the entry's links, so
        # attribute access fails for entries that have no links at all.
        for enc in entry.get("enclosures", []):
            item.append(
                E.enclosure(
                    E.url(enc.get("href")),
                    E.length(enc.get("length")),
                    E.type(enc.get("type")),
                )
            )

        # Only HTML bodies are carried over, wrapped in CDATA.
        for c in entry.get("content", []):
            if c.get("type") == "text/html":
                item.append(CONTENT.encoded(CDATA(c.value)))

        media_content = entry.get("media_content")
        if isinstance(media_content, list):
            # NOTE(review): Media RSS likewise defines these as attributes of
            # <media:content>; kept as child elements - confirm.
            for media in media_content:
                if not media.get("url"):
                    continue
                item.append(
                    MEDIA.content(
                        E.url(media.get("url")),
                        E.type(media.get("type")),
                        E.medium(media.get("medium")),
                        E.isDefault(media.get("isDefault")),
                        E.expression(media.get("expression")),
                        E.bitrate(media.get("bitrate")),
                        E.framerate(media.get("framerate")),
                        E.samplingrate(media.get("samplingrate")),
                        E.channels(media.get("channels")),
                        E.duration(media.get("duration")),
                        E.height(media.get("height")),
                        E.width(media.get("width")),
                        E.lang(media.get("lang")),
                    )
                )
        return ElementItem(el=item)
|
||||
Loading…
Add table
Add a link
Reference in a new issue