Add support for JSON APIs in feedify

master
pictuga 2013-10-21 21:28:43 +02:00
parent 1802827d31
commit 1f40f2a099
2 changed files with 133 additions and 40 deletions
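
The rules that drive feedify come from a ConfigParser (ini-style) file, one section per supported site, matched against the page URL with fnmatch. The new code reads a 'mode' key from the matched rule to choose between the existing xpath scraping and the new JSON traversal. A hypothetical json-mode rule, for illustration only (the section name, the URL-pattern key and all paths are invented; mode, title, items and the item_* keys are the ones the code below actually reads):

[example api]
path = http://api.example.com/*
mode = json
title = "Example " feed.title
items = feed.entry
item_title = title
item_link = link
item_time = updated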

feedify.py

@@ -3,10 +3,12 @@
 from ConfigParser import ConfigParser
 from fnmatch import fnmatch
 import feeds
+import morss
 
 import re
 import urllib2
 import lxml.html
+import json
 import urlparse
 
 def toclass(query):
@@ -29,49 +31,139 @@ def getRule(link):
 def supported(link):
     return getRule(link) is not False
 
-def getString(html, expr):
-    matches = html.xpath(toclass(expr))
-    if len(matches):
-        out = ''
-        for match in matches:
-            if isinstance(match, basestring):
-                out += match
-            elif isinstance(match, lxml.html.HtmlElement):
-                out += lxml.html.tostring(match)
-        return out
-    else:
-        return ''
+def formatString(string, getter, error=False):
+    out = ""
+    char = string[0]
+    follow = string[1:]
+
+    if char == '"':
+        match = follow.partition('"')
+        out = match[0]
+        if len(match) >= 2:
+            next = match[2]
+        else:
+            next = None
+    elif char == '{':
+        match = follow.partition('}')
+        try:
+            test = formatString(match[0], getter, True)
+        except (ValueError, KeyError):
+            pass
+        else:
+            out = test
+        next = match[2]
+    elif char == ' ':
+        next = follow
+    elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
+        match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
+        rawValue = getter(match[0])
+        if not isinstance(rawValue, basestring):
+            if match[1] is not None:
+                out = match[1].join(rawValue)
+            else:
+                out = ''.join(rawValue)
+        if not out and error:
+            raise ValueError
+        next = match[2]
+    else:
+        raise ValueError('bogus string')
+
+    if next is not None and len(next):
+        return out + formatString(next, getter, error)
+    else:
+        return out
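
To make the template mini-language concrete before it is used: in the strings formatString() parses, quoted text is literal, a bare token is a rule path resolved through the getter (assumed, like Builder.strings() below, to return a list of strings), an optional <"..."> suffix supplies the join separator, and a {...} group is silently dropped when its content resolves to nothing. A minimal sketch with made-up values:

values = {'title': ['Hello'], 'tags': ['a', 'b', 'c']}
getter = lambda path: values.get(path, [])

formatString('"Title: " title', getter)       # returns 'Title: Hello'
formatString('tags<", ">', getter)            # returns 'a, b, c'
formatString('{"By " author} title', getter)  # no 'author' value, returns 'Hello'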
-def build(link, data=None):
-    rule = getRule(link)
-
-    if rule is False:
-        return False
-
-    if data is None:
-        data = urllib2.urlopen(link).read()
-
-    html = lxml.html.fromstring(data)
-    feed = feeds.FeedParserAtom()
-
-    if 'title' in rule:
-        feed.title = getString(html, rule['title'])
-
-    if 'items' in rule:
-        for item in html.xpath(toclass(rule['items'])):
-            feedItem = {}
-
-            if 'item_title' in rule:
-                feedItem['title'] = getString(item, rule['item_title'])
-            if 'item_link' in rule:
-                url = getString(item, rule['item_link'])
-                url = urlparse.urljoin(link, url)
-                feedItem['link'] = url
-            if 'item_desc' in rule:
-                feedItem['desc'] = getString(item, rule['item_desc'])
-            if 'item_content' in rule:
-                feedItem['content'] = getString(item, rule['item_content'])
-            if 'item_time' in rule:
-                feedItem['updated'] = getString(item, rule['item_time'])
-
-            feed.items.append(feedItem)
-
-    return feed
+
+class Builder(object):
+    def __init__(self, link, data=None):
+        self.link = link
+
+        if data is None:
+            data = urllib2.urlopen(link).read()
+        self.data = data
+
+        self.rule = getRule(link)
+
+        if self.rule['mode'] == 'xpath':
+            self.data = morss.decodeHTML(self.data)
+            self.doc = lxml.html.fromstring(self.data)
+        elif self.rule['mode'] == 'json':
+            self.doc = json.loads(data)
+
+        self.feed = feeds.FeedParserAtom()
+
+    def raw(self, html, expr):
+        if self.rule['mode'] == 'xpath':
+            return html.xpath(toclass(expr))
+        elif self.rule['mode'] == 'json':
+            a = [html]
+            b = []
+            for x in expr.strip(".").split("."):
+                match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
+                for elem in a:
+                    if isinstance(elem, dict):
+                        kids = elem.get(match[0])
+                        if kids is None:
+                            pass
+                        elif isinstance(kids, list):
+                            b.extend(kids)
+                        elif isinstance(kids, basestring):
+                            b.append(kids.replace('\n', '<br/>'))
+                        else:
+                            b.append(kids)
+                if match[1] is None:
+                    a = b
+                else:
+                    if len(b)-1 >= int(match[1]):
+                        a = [b[int(match[1])]]
+                    else:
+                        a = []
+                b = []
+            return a
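
In json mode, raw() walks the parsed document along a dotted path, flattening lists at each step; an [n] suffix keeps only one element of the intermediate results. A standalone transcription of that branch (the name json_raw and the sample document are made up for illustration; the real code lives on the class):

import re

def json_raw(doc, expr):
    # same traversal as the json branch of Builder.raw(), lifted out
    a = [doc]
    b = []
    for x in expr.strip('.').split('.'):
        key, index = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
        for elem in a:
            if isinstance(elem, dict):
                kids = elem.get(key)
                if kids is None:
                    pass
                elif isinstance(kids, list):
                    b.extend(kids)  # lists are flattened into the result
                elif isinstance(kids, basestring):
                    b.append(kids.replace('\n', '<br/>'))
                else:
                    b.append(kids)
        if index is None:
            a = b
        else:
            a = [b[int(index)]] if len(b) - 1 >= int(index) else []
        b = []
    return a

doc = {'feed': {'entry': [{'title': 'A'}, {'title': 'B'}]}}
print json_raw(doc, 'feed.entry.title')     # ['A', 'B']
print json_raw(doc, 'feed.entry[0].title')  # ['A']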
+    def strings(self, html, expr):
+        if self.rule['mode'] == 'xpath':
+            out = []
+            for match in self.raw(html, expr):
+                if isinstance(match, basestring):
+                    out.append(match)
+                elif isinstance(match, lxml.html.HtmlElement):
+                    out.append(lxml.html.tostring(match))
+            return out
+        elif self.rule['mode'] == 'json':
+            return self.raw(html, expr)
+
+    def string(self, html, expr):
+        getter = lambda x: self.strings(html, x)
+        return formatString(self.rule[expr], getter)
+
+    def build(self):
+        if 'title' in self.rule:
+            self.feed.title = self.string(self.doc, 'title')
+
+        if 'items' in self.rule:
+            matches = self.raw(self.doc, self.rule['items'])
+            if matches:
+                for item in matches:
+                    feedItem = {}
+
+                    if 'item_title' in self.rule:
+                        feedItem['title'] = self.string(item, 'item_title')
+                    if 'item_link' in self.rule:
+                        url = self.string(item, 'item_link')
+                        url = urlparse.urljoin(self.link, url)
+                        feedItem['link'] = url
+                    if 'item_desc' in self.rule:
+                        feedItem['desc'] = self.string(item, 'item_desc')
+                    if 'item_content' in self.rule:
+                        feedItem['content'] = self.string(item, 'item_content')
+                    if 'item_time' in self.rule:
+                        feedItem['updated'] = self.string(item, 'item_time')
+
+                    self.feed.items.append(feedItem)
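
Taken together, the new entry point is three steps, mirroring the updated call site in morss.py below (the URL is hypothetical and must match a configured rule; when data is omitted, __init__ fetches it with urllib2):

import feedify

builder = feedify.Builder('http://api.example.com/posts.json')
builder.build()
rss = builder.feed  # a feeds.FeedParserAtom, as consumed by morss.Gather()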

morss.py

@@ -410,8 +410,9 @@ def Gather(url, cachePath, options):
     if style == 'normal':
         rss = feeds.parse(xml)
     elif style == 'feedify':
-        xml = decodeHTML(xml)
-        rss = feedify.build(url, xml)
+        feed = feedify.Builder(url, xml)
+        feed.build()
+        rss = feed.feed
     elif style == 'html':
         match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
         if len(match):