Add support for JSON APIs in feedify
parent
1802827d31
commit
1f40f2a099
162
feedify.py
162
feedify.py
|
@ -3,10 +3,12 @@
|
||||||
from ConfigParser import ConfigParser
|
from ConfigParser import ConfigParser
|
||||||
from fnmatch import fnmatch
|
from fnmatch import fnmatch
|
||||||
import feeds
|
import feeds
|
||||||
|
import morss
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import urllib2
|
import urllib2
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
import json
|
||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
def toclass(query):
|
def toclass(query):
|
||||||
|
@ -29,49 +31,139 @@ def getRule(link):
|
||||||
def supported(link):
|
def supported(link):
    """Return True unless ``getRule(link)`` returns ``False``."""
    rule = getRule(link)
    return rule is not False
|
||||||
|
|
||||||
def getString(html, expr):
|
def formatString(string, getter, error=False):
    """Expand a rule template into a string.

    The template is consumed left to right, one token at a time:

    * ``"text"``  -- literal text, copied verbatim (up to the next ``"``);
    * ``{...}``   -- optional group: the enclosed template is expanded in
      strict mode and contributes nothing if a lookup inside it fails;
    * a space     -- token separator, produces no output;
    * ``expr`` or ``expr<"sep">`` -- ``getter(expr)`` is called and, unless
      it returns a plain string, the returned items are joined with
      ``sep`` (empty separator when none is given).

    string -- the template to expand
    getter -- callable taking an expression and returning a sequence of
              strings
    error  -- strict mode: raise ValueError when a lookup yields nothing

    Returns the expanded text ('' for an empty template).
    Raises ValueError('bogus string') when a token cannot be parsed.
    """
    parts = []
    rest = string

    # Iterate instead of recursing on the tail, so an empty template (or an
    # empty "{}" group) simply yields '' rather than failing on string[0].
    while rest:
        char, follow = rest[0], rest[1:]

        if char == '"':
            # partition() always returns a 3-tuple; a missing closing quote
            # just means the literal runs to the end of the template.
            literal, _, rest = follow.partition('"')
            parts.append(literal)

        elif char == '{':
            inner, _, rest = follow.partition('}')
            try:
                parts.append(formatString(inner, getter, True))
            except (ValueError, KeyError):
                # Optional group: a failed lookup drops the whole group.
                # (The tuple is required; "except ValueError, KeyError"
                # would only catch ValueError.)
                pass

        elif char == ' ':
            rest = follow

        else:
            match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', rest)
            if match is None:
                raise ValueError('bogus string')

            expr, separator, rest = match.groups()
            rawValue = getter(expr)

            out = ''
            # A plain string is deliberately not joined (joining would
            # interleave the separator between its characters).
            if not isinstance(rawValue, basestring):
                if separator is None:
                    separator = ''
                out = separator.join(rawValue)

            if not out and error:
                raise ValueError

            parts.append(out)

    return ''.join(parts)
|
||||||
|
|
||||||
|
class Builder(object):
    """Build an Atom feed from a document using the rule matching its URL.

    The rule is looked up with ``getRule(link)``; its ``mode`` entry selects
    how the document is parsed and queried: ``'xpath'`` (HTML parsed with
    lxml) or ``'json'`` (parsed with ``json.loads`` and queried with dotted
    paths). Call ``build()`` and then read the result from ``self.feed``.
    """

    # (feed item key, rule key) pairs copied by build() for every item.
    ITEM_FIELDS = (
        ('title', 'item_title'),
        ('link', 'item_link'),
        ('desc', 'item_desc'),
        ('content', 'item_content'),
        ('updated', 'item_time'),
    )

    def __init__(self, link, data=None):
        """Fetch (if needed) and parse the document.

        link -- URL of the document, also used to resolve relative links
        data -- raw document body; downloaded from ``link`` when None

        Raises ValueError when no rule matches ``link`` or when the rule's
        mode is neither 'xpath' nor 'json'.
        """
        self.link = link

        if data is None:
            response = urllib2.urlopen(link)
            try:
                data = response.read()
            finally:
                # Do not leave the connection open after reading.
                response.close()
        self.data = data

        self.rule = getRule(link)
        if self.rule is False:
            # getRule() signals "no rule" with False (see supported()).
            raise ValueError('no feedify rule matches %r' % (link,))

        mode = self.rule['mode']
        if mode == 'xpath':
            self.data = morss.decodeHTML(self.data)
            self.doc = lxml.html.fromstring(self.data)
        elif mode == 'json':
            self.doc = json.loads(data)
        else:
            # Fail here rather than later with a missing self.doc.
            raise ValueError('unsupported feedify mode %r' % (mode,))

        self.feed = feeds.FeedParserAtom()

    def raw(self, html, expr):
        """Evaluate ``expr`` against ``html`` and return the raw matches.

        In 'xpath' mode, returns whatever ``html.xpath(toclass(expr))``
        returns. In 'json' mode, ``expr`` is a dotted path such as
        ``data.items[0].title`` and a list of matching values is returned:
        each segment is looked up in every dict reached so far, list values
        are flattened into the result, and an optional ``[n]`` keeps only
        the n-th match of that step (no match at all if out of range).

        Raises ValueError for a malformed JSON path segment.
        """
        if self.rule['mode'] == 'xpath':
            return html.xpath(toclass(expr))

        elif self.rule['mode'] == 'json':
            current = [html]

            for segment in expr.strip(".").split("."):
                match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', segment)
                if match is None:
                    raise ValueError('bogus JSON path segment %r in %r'
                                     % (segment, expr))
                key, index = match.groups()

                found = []
                for elem in current:
                    if not isinstance(elem, dict):
                        continue

                    kids = elem.get(key)
                    if kids is None:
                        continue
                    elif isinstance(kids, list):
                        found.extend(kids)
                    elif isinstance(kids, basestring):
                        # Keep line breaks visible once rendered as HTML.
                        found.append(kids.replace('\n', '<br/>'))
                    else:
                        found.append(kids)

                if index is None:
                    current = found
                elif int(index) < len(found):
                    current = [found[int(index)]]
                else:
                    current = []

            return current

    def strings(self, html, expr):
        """Return the matches of ``expr`` as a list of strings.

        In 'xpath' mode, string results are kept as is, HTML elements are
        serialised with ``lxml.html.tostring`` and anything else is
        dropped. In 'json' mode the result of ``raw()`` is returned
        unchanged.
        """
        if self.rule['mode'] == 'xpath':
            matches = self.raw(html, expr)
            if not isinstance(matches, list):
                # XPath may evaluate to a scalar (string, number, boolean);
                # wrap it so a string is not iterated character by
                # character.
                matches = [matches]

            out = []
            for match in matches:
                if isinstance(match, basestring):
                    out.append(match)
                elif isinstance(match, lxml.html.HtmlElement):
                    out.append(lxml.html.tostring(match))
            return out

        elif self.rule['mode'] == 'json':
            return self.raw(html, expr)

    def string(self, html, expr):
        """Expand the rule template stored under key ``expr``.

        Note that ``expr`` is a key of the rule (e.g. 'item_title'), not a
        query: the template found there is expanded with formatString(),
        each lookup being resolved against ``html`` through strings().
        """
        def getter(query):
            return self.strings(html, query)

        return formatString(self.rule[expr], getter)

    def build(self):
        """Populate ``self.feed`` (title and items) from the document."""
        if 'title' in self.rule:
            self.feed.title = self.string(self.doc, 'title')

        if 'items' not in self.rule:
            return

        # raw() may yield nothing at all; treat that as "no items".
        for item in self.raw(self.doc, self.rule['items']) or ():
            feedItem = {}

            for field, key in self.ITEM_FIELDS:
                if key not in self.rule:
                    continue

                value = self.string(item, key)
                if field == 'link':
                    # Item links may be relative to the page URL.
                    value = urlparse.urljoin(self.link, value)
                feedItem[field] = value

            self.feed.items.append(feedItem)
|
||||||
return feed
|
|
||||||
|
|
5
morss.py
5
morss.py
|
@ -410,8 +410,9 @@ def Gather(url, cachePath, options):
|
||||||
if style == 'normal':
|
if style == 'normal':
|
||||||
rss = feeds.parse(xml)
|
rss = feeds.parse(xml)
|
||||||
elif style == 'feedify':
|
elif style == 'feedify':
|
||||||
xml = decodeHTML(xml)
|
feed = feedify.Builder(url, xml)
|
||||||
rss = feedify.build(url, xml)
|
feed.build()
|
||||||
|
rss = feed.feed
|
||||||
elif style == 'html':
|
elif style == 'html':
|
||||||
match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
|
match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
|
||||||
if len(match):
|
if len(match):
|
||||||
|
|
Loading…
Reference in New Issue