Add support for JSON APIs in feedify

master
pictuga 2013-10-21 21:28:43 +02:00
parent 1802827d31
commit 1f40f2a099
2 changed files with 133 additions and 40 deletions

feedify.py

@@ -3,10 +3,12 @@
 from ConfigParser import ConfigParser
 from fnmatch import fnmatch
 import feeds
+import morss
 import re
 import urllib2
 import lxml.html
+import json
 import urlparse
 
 def toclass(query):
@@ -29,49 +31,139 @@ def getRule(link):
 def supported(link):
 	return getRule(link) is not False
 
-def getString(html, expr):
-	matches = html.xpath(toclass(expr))
-	if len(matches):
-		out = ''
-		for match in matches:
-			if isinstance(match, basestring):
-				out += match
-			elif isinstance(match, lxml.html.HtmlElement):
-				out += lxml.html.tostring(match)
-		return out
-	else:
-		return ''
-
-def build(link, data=None):
-	rule = getRule(link)
-	if rule is False:
-		return False
-
-	if data is None:
-		data = urllib2.urlopen(link).read()
-
-	html = lxml.html.fromstring(data)
-	feed = feeds.FeedParserAtom()
-
-	if 'title' in rule:
-		feed.title = getString(html, rule['title'])
-
-	if 'items' in rule:
-		for item in html.xpath(toclass(rule['items'])):
-			feedItem = {}
-
-			if 'item_title' in rule:
-				feedItem['title'] = getString(item, rule['item_title'])
-			if 'item_link' in rule:
-				url = getString(item, rule['item_link'])
-				url = urlparse.urljoin(link, url)
-				feedItem['link'] = url
-			if 'item_desc' in rule:
-				feedItem['desc'] = getString(item, rule['item_desc'])
-			if 'item_content' in rule:
-				feedItem['content'] = getString(item, rule['item_content'])
-			if 'item_time' in rule:
-				feedItem['updated'] = getString(item, rule['item_time'])
-
-			feed.items.append(feedItem)
-
-	return feed
+def formatString(string, getter, error=False):
+	# Consume the head of the template string: '"…"' is a literal, '{…}' an
+	# optional group, and a bare token is resolved through getter() and
+	# joined with an optional <"separator">; then recurse on the rest.
+	out = ""
+	char = string[0]
+
+	follow = string[1:]
+
+	if char == '"':
+		match = follow.partition('"')
+		out = match[0]
+		next = match[2] or None
+	elif char == '{':
+		match = follow.partition('}')
+		try:
+			test = formatString(match[0], getter, True)
+		except (ValueError, KeyError):
+			pass
+		else:
+			out = test
+		next = match[2]
+	elif char == ' ':
+		next = follow
+	elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
+		match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
+		rawValue = getter(match[0])
+		if not isinstance(rawValue, basestring):
+			if match[1] is not None:
+				out = match[1].join(rawValue)
+			else:
+				out = ''.join(rawValue)
+		if not out and error:
+			raise ValueError
+		next = match[2]
+	else:
+		raise ValueError('bogus string')
+
+	if next is not None and len(next):
+		return out + formatString(next, getter, error)
+	else:
+		return out
+
+class Builder(object):
+	def __init__(self, link, data=None):
+		self.link = link
+
+		if data is None:
+			data = urllib2.urlopen(link).read()
+		self.data = data
+
+		self.rule = getRule(link)
+
+		if self.rule['mode'] == 'xpath':
+			self.data = morss.decodeHTML(self.data)
+			self.doc = lxml.html.fromstring(self.data)
+		elif self.rule['mode'] == 'json':
+			self.doc = json.loads(data)
+
+		self.feed = feeds.FeedParserAtom()
+
+	def raw(self, html, expr):
+		# Raw matches for expr: xpath nodes in 'xpath' mode, arbitrary
+		# decoded-json values in 'json' mode.
+		if self.rule['mode'] == 'xpath':
+			return html.xpath(toclass(expr))
+
+		elif self.rule['mode'] == 'json':
+			# Walk 'a.b[0]'-style paths: each step selects a key in every
+			# dict matched so far; '[N]' keeps only the N-th result.
+			a = [html]
+			b = []
+			for x in expr.strip(".").split("."):
+				match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
+				for elem in a:
+					if isinstance(elem, dict):
+						kids = elem.get(match[0])
+						if kids is None:
+							pass
+						elif isinstance(kids, list):
+							b.extend(kids)
+						elif isinstance(kids, basestring):
+							b.append(kids.replace('\n', '<br/>'))
+						else:
+							b.append(kids)
+
+				if match[1] is None:
+					a = b
+				elif len(b) - 1 >= int(match[1]):
+					a = [b[int(match[1])]]
+				else:
+					a = []
+				b = []
+
+			return a
+
+	def strings(self, html, expr):
+		if self.rule['mode'] == 'xpath':
+			out = []
+			for match in self.raw(html, expr):
+				if isinstance(match, basestring):
+					out.append(match)
+				elif isinstance(match, lxml.html.HtmlElement):
+					out.append(lxml.html.tostring(match))
+			return out
+
+		elif self.rule['mode'] == 'json':
+			return self.raw(html, expr)
+
+	def string(self, html, expr):
+		getter = lambda x: self.strings(html, x)
+		return formatString(self.rule[expr], getter)
+
+	def build(self):
+		if 'title' in self.rule:
+			self.feed.title = self.string(self.doc, 'title')
+
+		if 'items' in self.rule:
+			matches = self.raw(self.doc, self.rule['items'])
+			if matches and len(matches):
+				for item in matches:
+					feedItem = {}
+
+					if 'item_title' in self.rule:
+						feedItem['title'] = self.string(item, 'item_title')
+					if 'item_link' in self.rule:
+						url = self.string(item, 'item_link')
+						url = urlparse.urljoin(self.link, url)
+						feedItem['link'] = url
+					if 'item_desc' in self.rule:
+						feedItem['desc'] = self.string(item, 'item_desc')
+					if 'item_content' in self.rule:
+						feedItem['content'] = self.string(item, 'item_content')
+					if 'item_time' in self.rule:
+						feedItem['updated'] = self.string(item, 'item_time')
+
+					self.feed.items.append(feedItem)
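
For readers skimming the diff: formatString() implements a small template grammar for rule values. A double-quoted run is a literal, a bare token is looked up through getter() (which returns a list of strings) and joined with an optional <"separator">, and a {…} group is silently dropped when a lookup inside it comes back empty. Below is a minimal Python 3 port with made-up field data to show the intent; the committed code passes Builder.strings() bound to the current node as the getter, not a plain dict lookup.

import re

def format_string(string, getter, error=False):
    # '"…"' is a literal, '{…}' an optional group, a bare token a lookup
    # joined with an optional <"sep">; recurse on whatever text remains.
    out = ''
    char, follow = string[0], string[1:]
    nxt = None
    if char == '"':                      # quoted literal up to closing quote
        out, _, nxt = follow.partition('"')
    elif char == '{':                    # optional group: drop on failure
        inner, _, nxt = follow.partition('}')
        try:
            out = format_string(inner, getter, True)
        except (ValueError, KeyError):
            pass
    elif char == ' ':
        nxt = follow
    else:                                # bare token, optional <"sep"> joiner
        m = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string)
        if m is None:
            raise ValueError('bogus string')
        token, sep, nxt = m.groups()
        out = (sep or '').join(getter(token))
        if not out and error:
            raise ValueError
    return out + (format_string(nxt, getter, error) if nxt else '')

fields = {'user': ['alice'], 'tags': ['a', 'b'], 'empty': []}
getter = fields.get
print(format_string('"Posts by " user', getter))    # Posts by alice
print(format_string('tags<", ">', getter))          # a, b
print(format_string('"x" {"(" empty ")"}', getter)) # x  (group dropped)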
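
The 'json' branch of Builder.raw() gives rules a dotted-path syntax for JSON documents. Here is a standalone sketch of the same lookup, written for Python 3 against a hypothetical sample document; the committed version additionally rewrites '\n' as '<br/>' inside string values.

import json
import re

def json_raw(doc, expr):
    # Resolve 'a.b[0]'-style paths against parsed JSON: each dot-separated
    # step selects a key in every dict collected so far, and '[N]' keeps
    # only the N-th match.
    a = [doc]
    for step in expr.strip('.').split('.'):
        key, index = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', step).groups()
        b = []
        for elem in a:
            if isinstance(elem, dict):
                kids = elem.get(key)
                if kids is None:
                    continue
                if isinstance(kids, list):
                    b.extend(kids)
                else:
                    b.append(kids)
        if index is None:
            a = b
        elif int(index) < len(b):
            a = [b[int(index)]]
        else:
            a = []
    return a

sample = json.loads('{"feed": {"entry": ['
                    '{"title": "First post"}, {"title": "Second post"}]}}')

print(json_raw(sample, 'feed.entry'))           # both entry dicts
print(json_raw(sample, 'feed.entry[1].title'))  # ['Second post']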

morss.py

@@ -410,8 +410,9 @@ def Gather(url, cachePath, options):
 	if style == 'normal':
 		rss = feeds.parse(xml)
 	elif style == 'feedify':
-		xml = decodeHTML(xml)
-		rss = feedify.build(url, xml)
+		feed = feedify.Builder(url, xml)
+		feed.build()
+		rss = feed.feed
 	elif style == 'html':
 		match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
 		if len(match):
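
With this change Gather() no longer decodes HTML itself for feedify pages; the Builder does that (or json.loads() for API responses) according to the rule's mode. A rough sketch of how the new entry point is driven, with a placeholder URL; it assumes the URL matches an entry in feedify's rule file:

import feedify

url = 'http://api.example.com/posts'  # hypothetical rule-matched URL

if feedify.supported(url):        # a rule exists for this link
    feed = feedify.Builder(url)   # fetches the page, picks xpath/json mode
    feed.build()                  # fills feed.feed with title and items
    rss = feed.feed               # a feeds.FeedParserAtom instance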