From 1f40f2a099ba1437b5a9fe4d3dd40acec453b31b Mon Sep 17 00:00:00 2001
From: pictuga
Date: Mon, 21 Oct 2013 21:28:43 +0200
Subject: [PATCH] Add support for JSON APIs in feedify

Feedify rules gain a 'mode' key ('xpath' or 'json'): JSON documents are
traversed with dotted path expressions (e.g. "a.b[0].c"), and rule values
are assembled with the new formatString() mini-language. build() moves
into a Builder class so morss.py drives it via Builder(url, xml).build().

Review fixes folded in: catch ValueError and KeyError as a tuple — the
comma form `except ValueError, KeyError` only caught ValueError and bound
it to the name KeyError — and drop two leftover debug print statements.
---
 feedify.py | 166 ++++++++++++++++++++++++++++++++++++++++-----------
 morss.py   |   5 +-
 2 files changed, 131 insertions(+), 40 deletions(-)

diff --git a/feedify.py b/feedify.py
index db5cc3f..340e7f2 100644
--- a/feedify.py
+++ b/feedify.py
@@ -3,10 +3,12 @@
 from ConfigParser import ConfigParser
 from fnmatch import fnmatch
 import feeds
+import morss
 
 import re
 import urllib2
 import lxml.html
+import json
 import urlparse
 
 def toclass(query):
@@ -29,49 +31,137 @@ def getRule(link):
 def supported(link):
     return getRule(link) is not False
 
-def getString(html, expr):
-    matches = html.xpath(toclass(expr))
-    if len(matches):
-        out = ''
-        for match in matches:
-            if isinstance(match, basestring):
-                out += match
-            elif isinstance(match, lxml.html.HtmlElement):
-                out += lxml.html.tostring(match)
-        return out
+def formatString(string, getter, error=False):
+    out = ""
+    char = string[0]
+
+    follow = string[1:]
+
+    if char == '"':
+        match = follow.partition('"')
+        out = match[0]
+        if len(match) >= 2:
+            next = match[2]
+        else:
+            next = None
+    elif char == '{':
+        match = follow.partition('}')
+        try:
+            test = formatString(match[0], getter, True)
+        except (ValueError, KeyError):
+            pass
+        else:
+            out = test
+
+        next = match[2]
+    elif char == ' ':
+        next = follow
+    elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
+        match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
+        rawValue = getter(match[0])
+        if not isinstance(rawValue, basestring):
+            if match[1] is not None:
+                out = match[1].join(rawValue)
+            else:
+                out = ''.join(rawValue)
+        if not out and error:
+            raise ValueError
+        next = match[2]
     else:
-        return ''
+        raise ValueError('bogus string')
 
-def build(link, data=None):
-    rule = getRule(link)
-    if rule is False:
-        return False
+    if next is not None and len(next):
+        return out + formatString(next, getter, error)
+    else:
+        return out
 
-    if data is None:
-        data = urllib2.urlopen(link).read()
+class Builder(object):
+    def __init__(self, link, data=None):
+        self.link = link
 
-    html = lxml.html.fromstring(data)
-    feed = feeds.FeedParserAtom()
+        if data is None:
+            data = urllib2.urlopen(link).read()
+        self.data = data
 
-    if 'title' in rule:
-        feed.title = getString(html, rule['title'])
+        self.rule = getRule(link)
 
-    if 'items' in rule:
-        for item in html.xpath(toclass(rule['items'])):
-            feedItem = {}
+        if self.rule['mode'] == 'xpath':
+            self.data = morss.decodeHTML(self.data)
+            self.doc = lxml.html.fromstring(self.data)
+        elif self.rule['mode'] == 'json':
+            self.doc = json.loads(data)
 
-            if 'item_title' in rule:
-                feedItem['title'] = getString(item, rule['item_title'])
-            if 'item_link' in rule:
-                url = getString(item, rule['item_link'])
-                url = urlparse.urljoin(link, url)
-                feedItem['link'] = url
-            if 'item_desc' in rule:
-                feedItem['desc'] = getString(item, rule['item_desc'])
-            if 'item_content' in rule:
-                feedItem['content'] = getString(item, rule['item_content'])
-            if 'item_time' in rule:
-                feedItem['updated'] = getString(item, rule['item_time'])
+        self.feed = feeds.FeedParserAtom()
 
-            feed.items.append(feedItem)
-    return feed
+    def raw(self, html, expr):
+        if self.rule['mode'] == 'xpath':
+            return html.xpath(toclass(expr))
+
+        elif self.rule['mode'] == 'json':
+            a = [html]
+            b = []
+            for x in expr.strip(".").split("."):
+                match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
+                for elem in a:
+                    if isinstance(elem, dict):
+                        kids = elem.get(match[0])
+                        if kids is None:
+                            pass
+                        elif isinstance(kids, list):
+                            [b.append(i) for i in kids]
+                        elif isinstance(kids, basestring):
+                            b.append(kids.replace('\n', ' '))
+                        else:
+                            b.append(kids)
+
+                if match[1] is None:
+                    a = b
+                else:
+                    if len(b)-1 >= int(match[1]):
+                        a = [b[int(match[1])]]
+                    else:
+                        a = []
+                b = []
+            return a
+
+    def strings(self, html, expr):
+        if self.rule['mode'] == 'xpath':
+            out = []
+            for match in self.raw(html, expr):
+                if isinstance(match, basestring):
+                    out.append(match)
+                elif isinstance(match, lxml.html.HtmlElement):
+                    out.append(lxml.html.tostring(match))
+            return out
+
+        elif self.rule['mode'] == 'json':
+            return self.raw(html, expr)
+
+    def string(self, html, expr):
+        getter = lambda x: self.strings(html, x)
+        return formatString(self.rule[expr], getter)
+
+    def build(self):
+        if 'title' in self.rule:
+            self.feed.title = self.string(self.doc, 'title')
+
+        if 'items' in self.rule:
+            matches = self.raw(self.doc, self.rule['items'])
+            if matches and len(matches):
+                for item in matches:
+                    feedItem = {}
+
+                    if 'item_title' in self.rule:
+                        feedItem['title'] = self.string(item, 'item_title')
+                    if 'item_link' in self.rule:
+                        url = self.string(item, 'item_link')
+                        url = urlparse.urljoin(self.link, url)
+                        feedItem['link'] = url
+                    if 'item_desc' in self.rule:
+                        feedItem['desc'] = self.string(item, 'item_desc')
+                    if 'item_content' in self.rule:
+                        feedItem['content'] = self.string(item, 'item_content')
+                    if 'item_time' in self.rule:
+                        feedItem['updated'] = self.string(item, 'item_time')
+
+                    self.feed.items.append(feedItem)
diff --git a/morss.py b/morss.py
index ac36be0..5a3a49c 100644
--- a/morss.py
+++ b/morss.py
@@ -410,8 +410,9 @@
     if style == 'normal':
         rss = feeds.parse(xml)
     elif style == 'feedify':
-        xml = decodeHTML(xml)
-        rss = feedify.build(url, xml)
+        feed = feedify.Builder(url, xml)
+        feed.build()
+        rss = feed.feed
     elif style == 'html':
         match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
         if len(match):