Add support for JSON APIs in feedify

2013-10-21 21:28:43 +02:00 · 2013-10-21 21:28:43 +02:00 · 1f40f2a099
parent 1802827d31
commit 1f40f2a099
2 changed files with 133 additions and 40 deletions
--- a/feedify.py
+++ b/feedify.py
@ -3,10 +3,12 @@
 from ConfigParser import ConfigParser
 from fnmatch import fnmatch
 import feeds
 import morss
 import re
 import urllib2
 import lxml.html
 import json
 import urlparse
 def toclass(query):
@ -29,49 +31,139 @@ def getRule(link):
 def supported(link):
 	return getRule(link) is not False
-def getString(html, expr):
+def formatString(string, getter, error=False):
-	matches = html.xpath(toclass(expr))
+	out = ""
-	if len(matches):
+	char = string[0]
-		out = ''
+
-		for match in matches:
+	follow = string[1:]
-			if isinstance(match, basestring):
+
-				out += match
+	if char == '"':
-			elif isinstance(match, lxml.html.HtmlElement):
+		match = follow.partition('"')
-				out += lxml.html.tostring(match)
+		out = match[0]
-		return out
+		if len(match) >= 2:
 			next = match[2]
 		else:
 			next = None
 	elif char == '{':
 		match = follow.partition('}')
 		try:
 			test = formatString(match[0], getter, True)
 		except ValueError, KeyError:
 			pass
 		else:
 			out = test
 		next = match[2]
 	elif char == ' ':
 		next = follow
 	elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
 		match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
 		rawValue = getter(match[0])
 		print repr(rawValue)
 		if not isinstance(rawValue, basestring):
 			if match[1] is not None:
 				out = match[1].join(rawValue)
 			else:
 				out = ''.join(rawValue)
 		if not out and error:
 			raise ValueError
 		next = match[2]
 	else:
-		return ''
+		raise ValueError('bogus string')
-def build(link, data=None):
+	if next is not None and len(next):
-	rule = getRule(link)
+		return out + formatString(next, getter, error)
-	if rule is False:
+	else:
-		return False
+		return out
-	if data is None:
+class Builder(object):
-		data = urllib2.urlopen(link).read()
+	def __init__(self, link, data=None):
 		self.link = link
-	html = lxml.html.fromstring(data)
+		if data is None:
-	feed = feeds.FeedParserAtom()
+			data = urllib2.urlopen(link).read()
 		self.data = data
-	if 'title' in rule:
+		self.rule = getRule(link)
 		feed.title = getString(html, rule['title'])
-	if 'items' in rule:
+		if self.rule['mode'] == 'xpath':
-		for item in html.xpath(toclass(rule['items'])):
+			self.data = morss.decodeHTML(self.data)
-			feedItem = {}
+			self.doc = lxml.html.fromstring(self.data)
 		elif self.rule['mode'] == 'json':
 			self.doc = json.loads(data)
-			if 'item_title' in rule:
+		self.feed = feeds.FeedParserAtom()
 				feedItem['title'] = getString(item, rule['item_title'])
 			if 'item_link' in rule:
 				url = getString(item, rule['item_link'])
 				url = urlparse.urljoin(link, url)
 				feedItem['link'] = url
 			if 'item_desc' in rule:
 				feedItem['desc'] = getString(item, rule['item_desc'])
 			if 'item_content' in rule:
 				feedItem['content'] = getString(item, rule['item_content'])
 			if 'item_time' in rule:
 				feedItem['updated'] = getString(item, rule['item_time'])
-			feed.items.append(feedItem)
+	def raw(self, html, expr):
-	return feed
+		if self.rule['mode'] == 'xpath':
 			print 1, toclass(expr)
 			return html.xpath(toclass(expr))
 		elif self.rule['mode'] == 'json':
 			a = [html]
 			b = []
 			for x in expr.strip(".").split("."):
 				match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
 				for elem in a:
 					if isinstance(elem, dict):
 						kids = elem.get(match[0])
 						if kids is None:
 							pass
 						elif isinstance(kids, list):
 							[b.append(i) for i in kids]
 						elif isinstance(kids, basestring):
 							b.append(kids.replace('\n', '<br/>'))
 						else:
 							b.append(kids)
 				if match[1] is None:
 					a = b
 				else:
 					if len(b)-1 >= int(match[1]):
 						a = [b[int(match[1])]]
 					else:
 						a = []
 				b = []
 			return a
 	def strings(self, html, expr):
 		"""Evaluate `expr` against `html` and return the matches as a list.

 		In 'xpath' mode every match returned by self.raw() is turned into a
 		string: string results are kept as they are and HtmlElement results
 		are serialised with lxml.html.tostring(). Matches of any other type
 		are silently dropped.

 		In 'json' mode the list returned by self.raw() is passed through
 		unchanged, so it may still contain non-string values.

 		For any other mode nothing is returned (implicit None).
 		"""
 		if self.rule['mode'] == 'xpath':
 			out = []
 			for match in self.raw(html, expr):
 				if isinstance(match, basestring):
 					# text or attribute result: already a string
 					out.append(match)
 				elif isinstance(match, lxml.html.HtmlElement):
 					# element result: keep its markup
 					out.append(lxml.html.tostring(match))
 			return out
 		elif self.rule['mode'] == 'json':
 			return self.raw(html, expr)
 	def string(self, html, expr):
 		"""Expand the rule template stored under key `expr` against `html`.

 		Note that `expr` is the name of a rule entry (e.g. 'title' or
 		'item_title'), not an expression itself: the template is read from
 		self.rule[expr] and handed to formatString(), which calls `getter`
 		for each expression it needs resolved.
 		"""
 		# bind `html` so formatString() only has to supply the expression
 		getter = lambda x: self.strings(html, x)
 		return formatString(self.rule[expr], getter)
 	def build(self):
 		"""Fill self.feed from self.doc according to self.rule.

 		Sets the feed title when the rule has a 'title' entry, and appends
 		one dict per match of the 'items' expression to self.feed.items.
 		Each optional 'item_*' rule entry fills the corresponding key of
 		that dict. Nothing is returned; the result is left in self.feed.
 		"""
 		if 'title' in self.rule:
 			self.feed.title = self.string(self.doc, 'title')
 		if 'items' in self.rule:
 			matches = self.raw(self.doc, self.rule['items'])
 			# raw() may give back None or an empty list: nothing to add then
 			if matches and len(matches):
 				for item in matches:
 					feedItem = {}
 					if 'item_title' in self.rule:
 						feedItem['title'] = self.string(item, 'item_title')
 					if 'item_link' in self.rule:
 						url = self.string(item, 'item_link')
 						# resolve relative links against the page address
 						url = urlparse.urljoin(self.link, url)
 						feedItem['link'] = url
 					if 'item_desc' in self.rule:
 						feedItem['desc'] = self.string(item, 'item_desc')
 					if 'item_content' in self.rule:
 						feedItem['content'] = self.string(item, 'item_content')
 					if 'item_time' in self.rule:
 						# the 'item_time' rule is stored under the 'updated' key
 						feedItem['updated'] = self.string(item, 'item_time')
 					self.feed.items.append(feedItem)
--- a/morss.py
+++ b/morss.py
@ -410,8 +410,9 @@ def Gather(url, cachePath, options):
 	if style == 'normal':
 		rss = feeds.parse(xml)
 	elif style == 'feedify':
-		xml = decodeHTML(xml)
+		feed = feedify.Builder(url, xml)
-		rss = feedify.build(url, xml)
+		feed.build()
 		rss = feed.feed
 	elif style == 'html':
 		match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
 		if len(match):