feedify: accept xpath rules passed as parameters
parent
bf3ef586c2
commit
167e3e4a15
|
@ -114,7 +114,8 @@ def pre_worker(url):
|
||||||
|
|
||||||
|
|
||||||
class Builder(object):
|
class Builder(object):
|
||||||
def __init__(self, link, data):
|
def __init__(self, link, data, rule=None):
|
||||||
|
|
||||||
self.link = link
|
self.link = link
|
||||||
self.data = data
|
self.data = data
|
||||||
self.rule = rule
|
self.rule = rule
|
||||||
|
@ -124,7 +125,8 @@ class Builder(object):
|
||||||
if isinstance(self.data, bytes):
|
if isinstance(self.data, bytes):
|
||||||
self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace')
|
self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace')
|
||||||
|
|
||||||
self.rule = get_rule(link)
|
if self.rule is None:
|
||||||
|
self.rule = get_rule(link)
|
||||||
|
|
||||||
if self.rule['mode'] == 'xpath':
|
if self.rule['mode'] == 'xpath':
|
||||||
self.doc = lxml.html.fromstring(self.data)
|
self.doc = lxml.html.fromstring(self.data)
|
||||||
|
|
|
@ -348,7 +348,9 @@ def FeedFetch(url, options):
|
||||||
delay = 0
|
delay = 0
|
||||||
|
|
||||||
try:
|
try:
|
||||||
con = crawler.custom_handler('xml', True, delay, options.encoding, not feedify.supported(url)).open(url, timeout=TIMEOUT * 2) # feedify.supported(url) to use full crawler if using feedify
|
con = crawler.custom_handler('xml', True, delay, options.encoding,
|
||||||
|
not feedify.supported(url) or not options.items).open(url, timeout=TIMEOUT * 2)
|
||||||
|
# feedify.supported(url) to use full crawler if using feedify
|
||||||
xml = con.read()
|
xml = con.read()
|
||||||
|
|
||||||
except (IOError, HTTPException):
|
except (IOError, HTTPException):
|
||||||
|
@ -360,10 +362,29 @@ def FeedFetch(url, options):
|
||||||
rss = feeds.parse(xml)
|
rss = feeds.parse(xml)
|
||||||
|
|
||||||
elif feedify.supported(url):
|
elif feedify.supported(url):
|
||||||
|
# using config file-based feedify
|
||||||
feed = feedify.Builder(url, xml)
|
feed = feedify.Builder(url, xml)
|
||||||
feed.build()
|
feed.build()
|
||||||
rss = feed.feed
|
rss = feed.feed
|
||||||
|
|
||||||
|
elif options.items:
|
||||||
|
# using argument-based feedify
|
||||||
|
rule = {'items': options.items}
|
||||||
|
rule['mode'] = 'xpath'
|
||||||
|
|
||||||
|
if options.item_title:
|
||||||
|
rule['item_title'] = options.item_title
|
||||||
|
if options.item_link:
|
||||||
|
rule['item_link'] = options.item_link
|
||||||
|
if options.item_content:
|
||||||
|
rule['item_content'] = options.item_content
|
||||||
|
if options.item_time:
|
||||||
|
rule['item_time'] = options.item_time
|
||||||
|
|
||||||
|
feed = feedify.Builder(url, xml, rule)
|
||||||
|
feed.build()
|
||||||
|
rss = feed.feed
|
||||||
|
|
||||||
else:
|
else:
|
||||||
log('random page')
|
log('random page')
|
||||||
log(contenttype)
|
log(contenttype)
|
||||||
|
@ -504,11 +525,14 @@ def cgi_app(environ, start_response):
|
||||||
|
|
||||||
if url.startswith(':'):
|
if url.startswith(':'):
|
||||||
split = url.split('/', 1)
|
split = url.split('/', 1)
|
||||||
options = split[0].split(':')[1:]
|
|
||||||
|
options = split[0].replace('|', '/').split(':')[1:]
|
||||||
|
|
||||||
if len(split) > 1:
|
if len(split) > 1:
|
||||||
url = split[1]
|
url = split[1]
|
||||||
else:
|
else:
|
||||||
url = ''
|
url = ''
|
||||||
|
|
||||||
else:
|
else:
|
||||||
options = []
|
options = []
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue