From 685bc7988eb5eaba92d9570af315ba2c64de2cae Mon Sep 17 00:00:00 2001 From: pictuga Date: Tue, 1 Oct 2013 20:18:55 +0200 Subject: [PATCH] Improve feedify string grabbing --- feedify.ini | 2 +- feedify.py | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/feedify.ini b/feedify.ini index 7e11866..36b1728 100644 --- a/feedify.ini +++ b/feedify.ini @@ -6,6 +6,6 @@ path= title= //head/title/text() items= //div[class=tweet] -item_title= (.//span[class=username]//text())[2] +item_title= .//span[class=username]//text() item_link= .//a[class=details]/@href item_content= .//p[class=tweet-text] diff --git a/feedify.py b/feedify.py index fd29ac7..3f94f2e 100644 --- a/feedify.py +++ b/feedify.py @@ -14,7 +14,7 @@ def toclass(query): repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]' return re.sub(pattern, repl, query) -def getRule(link=URL): +def getRule(link): config = ConfigParser() config.read('feedify.ini') @@ -29,10 +29,16 @@ def getRule(link=URL): def supported(link): return getRule(link) is not False -def getString(expr, html): - match = html.xpath(toclass(expr)) - if len(match): - return match[0].text_content() +def getString(html, expr): + matches = html.xpath(toclass(expr)) + if len(matches): + out = '' + for match in matches: + if isinstance(match, basestring): + out += match + elif isinstance(match, lxml.html.HtmlElement): + out += lxml.html.tostring(match) + return out else: return '' @@ -48,22 +54,22 @@ def build(link, data=None): feed = feeds.FeedParserAtom() if 'title' in rule: - feed.title = html.xpath(toclass(rule['title']))[0] + feed.title = getString(html, rule['title']) if 'items' in rule: for item in html.xpath(toclass(rule['items'])): feedItem = {} if 'item_title' in rule: - feedItem['title'] = item.xpath(toclass(rule['item_title']))[0] + feedItem['title'] = getString(item, rule['item_title']) if 'item_link' in rule: - url = item.xpath(toclass(rule['item_link']))[0] + url = getString(item, rule['item_link']) url = urlparse.urljoin(link, url) feedItem['link'] = url if 'item_desc' in rule: - feedItem['desc'] = lxml.html.tostring(item.xpath(toclass(rule['item_desc']))[0], encoding='unicode') + feedItem['desc'] = getString(item, rule['item_desc']) if 'item_content' in rule: - feedItem['content'] = lxml.html.tostring(item.xpath(toclass(rule['item_content']))[0]) + feedItem['content'] = getString(item, rule['item_content']) feed.items.append(feedItem) return feed