#!/usr/bin/env python
"""Turn an ordinary HTML page into an Atom feed using per-site rules.

Rules are read from ``feedify.ini``.  Each section may define a multi-line
``path`` option (shell-style URL patterns) plus XPath expressions under the
keys ``title``, ``items``, ``item_title``, ``item_link``, ``item_desc`` and
``item_content``.  XPath expressions may use the ``[class=name]`` shorthand,
which is expanded by :func:`toclass`.
"""

from ConfigParser import ConfigParser
from contextlib import closing
from fnmatch import fnmatch
import feeds
import re

import urllib2
import lxml.html
import urlparse

# ``[class=foo]`` shorthand and the XPath predicate it expands to.  The
# predicate matches ``foo`` as one whole, space-separated token of @class.
_CLASS_PATTERN = re.compile(r'\[class=([^\]]+)\]')
_CLASS_XPATH = (r'[@class and contains(concat(" ", normalize-space(@class), " "),'
                r' " \1 ")]')


def toclass(query):
    """Expand every ``[class=name]`` shorthand in *query* into real XPath."""
    return _CLASS_PATTERN.sub(_CLASS_XPATH, query)


def getRule(link):
    """Return the rule (a dict of options) whose paths match *link*.

    ``feedify.ini`` is looked up relative to the current working directory.
    The ``path`` option of the returned dict is replaced by the list of its
    patterns.  Returns ``False`` when no section matches.
    """
    config = ConfigParser()
    config.read('feedify.ini')

    for section in config.sections():
        values = dict(config.items(section))
        if 'path' not in values:
            # A section without patterns can never match; skipping it keeps
            # one incomplete section from breaking the lookup for all links.
            continue

        # The first line of the option is dropped -- presumably the patterns
        # are written on the lines following ``path =``, which leaves an
        # empty first element.  TODO confirm against feedify.ini.
        values['path'] = values['path'].split('\n')[1:]
        if any(fnmatch(link, path) for path in values['path']):
            return values
    return False


def supported(link):
    """Tell whether a rule exists for *link*."""
    return getRule(link) is not False


def _first(node, expr):
    """Return the first result of XPath *expr* on *node*, or None if empty."""
    matches = node.xpath(toclass(expr))
    if not matches:
        return None
    return matches[0]


def getString(expr, html):
    """Return the text content of the first match of *expr*, or ``''``."""
    match = _first(html, expr)
    # Compare against None explicitly: a childless lxml element is falsy.
    if match is None:
        return ''
    return match.text_content()


def build(link, data=None):
    """Build an Atom feed for the page at *link*.

    *data* is the page's HTML; when it is None the page is downloaded from
    *link*.  Returns a ``feeds.FeedParserAtom`` filled according to the
    matching rule, or ``False`` when no rule matches *link*.  A field whose
    XPath expression matches nothing is left unset rather than raising.
    """
    rule = getRule(link)
    if rule is False:
        return False

    if data is None:
        # urllib2 responses are not context managers; closing() makes sure
        # the connection is released even if read() fails.
        with closing(urllib2.urlopen(link)) as response:
            data = response.read()

    html = lxml.html.fromstring(data)
    feed = feeds.FeedParserAtom()

    if 'title' in rule:
        title = _first(html, rule['title'])
        if title is not None:
            feed.title = title

    if 'items' in rule:
        for item in html.xpath(toclass(rule['items'])):
            feed_item = {}

            if 'item_title' in rule:
                item_title = _first(item, rule['item_title'])
                if item_title is not None:
                    feed_item['title'] = item_title

            if 'item_link' in rule:
                url = _first(item, rule['item_link'])
                if url is not None:
                    # Resolve relative links against the page address.
                    feed_item['link'] = urlparse.urljoin(link, url)

            if 'item_desc' in rule:
                desc = _first(item, rule['item_desc'])
                if desc is not None:
                    feed_item['desc'] = lxml.html.tostring(
                        desc, encoding='unicode')

            if 'item_content' in rule:
                content = _first(item, rule['item_content'])
                if content is not None:
                    # Serialized as unicode, same as 'desc' above.
                    feed_item['content'] = lxml.html.tostring(
                        content, encoding='unicode')

            feed.items.append(feed_item)
    return feed
lxml.html.clean import lxml.builder import feeds +import feedify import httplib import urllib2 @@ -377,6 +378,8 @@ def Gather(url, cachePath, progress=False): if xml[:5] == '