Improve feedify string grabbing

2013-10-01 20:18:55 +02:00
parent 050be7690d
commit 685bc7988e
2 changed files with 17 additions and 11 deletions
--- a/feedify.ini
+++ b/feedify.ini
@@ -6,6 +6,6 @@ path=
 title=	//head/title/text()
 items=	//div[class=tweet]

-item_title=	(.//span[class=username]//text())[2]
+item_title=	.//span[class=username]//text()
 item_link=	.//a[class=details]/@href
 item_content=	.//p[class=tweet-text]
--- a/feedify.py
+++ b/feedify.py
@@ -14,7 +14,7 @@ def toclass(query):
 	repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
 	return re.sub(pattern, repl, query)

-def getRule(link=URL):
+def getRule(link):
 	config = ConfigParser()
 	config.read('feedify.ini')

@@ -29,10 +29,16 @@ def getRule(link=URL):
 def supported(link):
 	return getRule(link) is not False

-def getString(expr, html):
-	match = html.xpath(toclass(expr))
-	if len(match):
-		return match[0].text_content()
+def getString(html, expr):  # evaluate expr (after toclass() rewriting) on html and return all supported matches concatenated into one string
+	matches = html.xpath(toclass(expr))
+	if len(matches):
+		out = ''
+		for match in matches:  # matches that are neither strings nor HtmlElement are skipped
+			if isinstance(match, basestring):  # string results (e.g. the text() / @href rules in feedify.ini) are appended as-is
+				out += match
+			elif isinstance(match, lxml.html.HtmlElement):  # element results are appended as serialized markup
+				out += lxml.html.tostring(match)
+		return out
 	else:
 		return ''

@@ -48,22 +54,22 @@ def build(link, data=None):
 	feed = feeds.FeedParserAtom()

 	if 'title' in rule:
-		feed.title = html.xpath(toclass(rule['title']))[0]
+		feed.title = getString(html, rule['title'])

 	if 'items' in rule:
 		for item in html.xpath(toclass(rule['items'])):
 			feedItem = {}

 			if 'item_title' in rule:
-				feedItem['title'] = item.xpath(toclass(rule['item_title']))[0]
+				feedItem['title'] = getString(item, rule['item_title'])
 			if 'item_link' in rule:
-				url = item.xpath(toclass(rule['item_link']))[0]
+				url = getString(item, rule['item_link'])
 				url = urlparse.urljoin(link, url)
 				feedItem['link'] = url
 			if 'item_desc' in rule:
-				feedItem['desc'] = lxml.html.tostring(item.xpath(toclass(rule['item_desc']))[0], encoding='unicode')
+				feedItem['desc'] = getString(item, rule['item_desc'])
 			if 'item_content' in rule:
-				feedItem['content'] = lxml.html.tostring(item.xpath(toclass(rule['item_content']))[0])
+				feedItem['content'] = getString(item, rule['item_content'])

 			feed.items.append(feedItem)
 	return feed