Add support for JSON APIs in feedify

Branch: master
pictuga committed 2013-10-21 21:28:43 +02:00
parent 1802827d31 · commit 1f40f2a099
2 changed files with 133 additions and 40 deletions

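In brief: this commit replaces feedify's function-based scraper (getString()/build()) with a Builder class supporting two rule modes, 'xpath' for HTML pages and 'json' for JSON APIs, and introduces formatString(), a small template language for assembling feed fields from extracted values. morss.py is updated to use the new class.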
feedify.py

@@ -3,10 +3,12 @@
 from ConfigParser import ConfigParser
 from fnmatch import fnmatch
 import feeds
+import morss
 import re
 import urllib2
 import lxml.html
+import json
 import urlparse
 
 def toclass(query):
@@ -29,49 +31,139 @@ def getRule(link):
 def supported(link):
     return getRule(link) is not False
 
-def getString(html, expr):
-    matches = html.xpath(toclass(expr))
-    if len(matches):
-        out = ''
-        for match in matches:
-            if isinstance(match, basestring):
-                out += match
-            elif isinstance(match, lxml.html.HtmlElement):
-                out += lxml.html.tostring(match)
-        return out
-    else:
-        return ''
-
-def build(link, data=None):
-    rule = getRule(link)
-    if rule is False:
-        return False
-
-    if data is None:
-        data = urllib2.urlopen(link).read()
-
-    html = lxml.html.fromstring(data)
-    feed = feeds.FeedParserAtom()
-
-    if 'title' in rule:
-        feed.title = getString(html, rule['title'])
-
-    if 'items' in rule:
-        for item in html.xpath(toclass(rule['items'])):
-            feedItem = {}
-
-            if 'item_title' in rule:
-                feedItem['title'] = getString(item, rule['item_title'])
-            if 'item_link' in rule:
-                url = getString(item, rule['item_link'])
-                url = urlparse.urljoin(link, url)
-                feedItem['link'] = url
-            if 'item_desc' in rule:
-                feedItem['desc'] = getString(item, rule['item_desc'])
-            if 'item_content' in rule:
-                feedItem['content'] = getString(item, rule['item_content'])
-            if 'item_time' in rule:
-                feedItem['updated'] = getString(item, rule['item_time'])
-
-            feed.items.append(feedItem)
-
-    return feed
+def formatString(string, getter, error=False):
+    out = ""
+    char = string[0]
+
+    follow = string[1:]
+
+    if char == '"':
+        match = follow.partition('"')
+        out = match[0]
+        if len(match) >= 2:
+            next = match[2]
+        else:
+            next = None
+    elif char == '{':
+        match = follow.partition('}')
+        try:
+            test = formatString(match[0], getter, True)
+        except (ValueError, KeyError):
+            pass
+        else:
+            out = test
+        next = match[2]
+    elif char == ' ':
+        next = follow
+    elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
+        match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
+        rawValue = getter(match[0])
+        if not isinstance(rawValue, basestring):
+            if match[1] is not None:
+                out = match[1].join(rawValue)
+            else:
+                out = ''.join(rawValue)
+        if not out and error:
+            raise ValueError
+        next = match[2]
+    else:
+        raise ValueError('bogus string')
+
+    if next is not None and len(next):
+        return out + formatString(next, getter, error)
+    else:
+        return out
+
+class Builder(object):
+    def __init__(self, link, data=None):
+        self.link = link
+
+        if data is None:
+            data = urllib2.urlopen(link).read()
+        self.data = data
+
+        self.rule = getRule(link)
+
+        if self.rule['mode'] == 'xpath':
+            self.data = morss.decodeHTML(self.data)
+            self.doc = lxml.html.fromstring(self.data)
+        elif self.rule['mode'] == 'json':
+            self.doc = json.loads(data)
+
+        self.feed = feeds.FeedParserAtom()
+
+    def raw(self, html, expr):
+        if self.rule['mode'] == 'xpath':
+            return html.xpath(toclass(expr))
+        elif self.rule['mode'] == 'json':
+            a = [html]
+            b = []
+            for x in expr.strip(".").split("."):
+                match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
+                for elem in a:
+                    if isinstance(elem, dict):
+                        kids = elem.get(match[0])
+                        if kids is None:
+                            pass
+                        elif isinstance(kids, list):
+                            b.extend(kids)
+                        elif isinstance(kids, basestring):
+                            b.append(kids.replace('\n', '<br/>'))
+                        else:
+                            b.append(kids)
+                if match[1] is None:
+                    a = b
+                else:
+                    if len(b)-1 >= int(match[1]):
+                        a = [b[int(match[1])]]
+                    else:
+                        a = []
+                b = []
+            return a
+
+    def strings(self, html, expr):
+        if self.rule['mode'] == 'xpath':
+            out = []
+            for match in self.raw(html, expr):
+                if isinstance(match, basestring):
+                    out.append(match)
+                elif isinstance(match, lxml.html.HtmlElement):
+                    out.append(lxml.html.tostring(match))
+            return out
+        elif self.rule['mode'] == 'json':
+            return self.raw(html, expr)
+
+    def string(self, html, expr):
+        getter = lambda x: self.strings(html, x)
+        return formatString(self.rule[expr], getter)
+
+    def build(self):
+        if 'title' in self.rule:
+            self.feed.title = self.string(self.doc, 'title')
+
+        if 'items' in self.rule:
+            matches = self.raw(self.doc, self.rule['items'])
+            if matches and len(matches):
+                for item in matches:
+                    feedItem = {}
+                    if 'item_title' in self.rule:
+                        feedItem['title'] = self.string(item, 'item_title')
+                    if 'item_link' in self.rule:
+                        url = self.string(item, 'item_link')
+                        url = urlparse.urljoin(self.link, url)
+                        feedItem['link'] = url
+                    if 'item_desc' in self.rule:
+                        feedItem['desc'] = self.string(item, 'item_desc')
+                    if 'item_content' in self.rule:
+                        feedItem['content'] = self.string(item, 'item_content')
+                    if 'item_time' in self.rule:
+                        feedItem['updated'] = self.string(item, 'item_time')
+
+                    self.feed.items.append(feedItem)
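A note on the new template syntax: formatString() consumes a rule string token by token. A quoted part ("...") is a literal, a braced part ({...}) is optional and dropped when its lookups come back empty, a bare space separates tokens without emitting anything, and a bare word is resolved through the getter callback, with an optional <"sep"> suffix to join multi-valued results. A minimal sketch (Python 2, like the codebase; the sample data and getter are invented for illustration):

    from feedify import formatString

    # Hypothetical pre-extracted values, shaped the way the Builder's
    # getter returns them: one list of strings per rule key.
    entry = {'title': ['Hello'], 'tags': ['a', 'b']}
    getter = lambda key: entry.get(key, [])

    # 'title' and 'tags' are resolved via the getter; <", "> joins the
    # multi-valued 'tags'; quoted parts are literals; bare spaces only
    # separate tokens.
    print formatString('title " [" tags<", "> "]"', getter)
    # -> Hello [a, b]

The {...} groups rely on the error flag: inside braces, lookups run with error=True, so an empty result raises ValueError and the whole group is silently omitted.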

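In 'json' mode, raw() resolves a rule as a dotted path against the decoded document: it splits the expression on '.', walks into dicts, flattens lists, honours an optional [n] index per component, and rewrites newlines in string values as <br/>. For instance, with doc = {'data': {'posts': [{'title': 'A'}, {'title': 'B'}]}}, the path 'data.posts.title' yields ['A', 'B'] and 'data.posts[1].title' yields ['B']. Given the ConfigParser and fnmatch imports, rules presumably live in an ini-style file matched by URL glob; a hypothetical entry for a JSON API might read (section name, URL pattern, and field paths are invented):

    [some json api]
    path = http://api.example.com/*
    mode = json
    title = meta.name
    items = data.posts
    item_title = title
    item_link = url
    item_desc = summary
    item_time = created_at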
morss.py

@ -410,8 +410,9 @@ def Gather(url, cachePath, options):
if style == 'normal': if style == 'normal':
rss = feeds.parse(xml) rss = feeds.parse(xml)
elif style == 'feedify': elif style == 'feedify':
xml = decodeHTML(xml) feed = feedify.Builder(url, xml)
rss = feedify.build(url, xml) feed.build()
rss = feed.feed
elif style == 'html': elif style == 'html':
match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href") match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
if len(match): if len(match):
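The same three-step call used in Gather() can be exercised standalone; a minimal sketch, assuming a URL that getRule() matches (the URL below is a placeholder):

    import feedify

    url = 'http://api.example.com/posts'  # placeholder; must match a rule's path glob

    if feedify.supported(url):
        builder = feedify.Builder(url)  # fetches the page itself when data is None
        builder.build()
        for item in builder.feed.items:  # items are plain dicts, as appended by build()
            print item.get('title'), item.get('link')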