morss/feedify.py

78 lines
1.8 KiB
Python

#!/usr/bin/env python
from ConfigParser import ConfigParser
from fnmatch import fnmatch
import feeds
import re
import urllib2
import lxml.html
import urlparse
def toclass(query):
    """Expand ``[class=name]`` shorthands in *query* into XPath predicates.

    Every ``[class=name]`` occurrence is replaced by a predicate that tests
    whether ``name`` appears as a whole, space-delimited token of the
    element's ``class`` attribute. Text outside such shorthands is returned
    unchanged.
    """
    shorthand = re.compile(r'\[class=([^\]]+)\]')
    # Padding both sides with spaces makes the test match complete class
    # tokens rather than substrings of longer class names.
    predicate = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
    return shorthand.sub(predicate, query)
def getRule(link):
    """Return the first rule in ``feedify.ini`` whose patterns match *link*.

    Each section of ``feedify.ini`` (looked up relative to the current
    working directory) is one rule. Its ``path`` option holds shell-style
    patterns, one per line, which are tested against *link* with
    ``fnmatch``.

    Returns the matching section as a dict of option name -> value, with
    ``'path'`` replaced by the list of its patterns, or ``False`` when no
    section matches.
    """
    config = ConfigParser()
    config.read('feedify.ini')

    for section in config.sections():
        values = dict(config.items(section))
        # Drop blank lines rather than unconditionally discarding the first
        # line: a multi-line value starts with an empty line, but a pattern
        # written inline ("path = http://...") must not be thrown away.
        # A section without a "path" option simply has no patterns.
        raw_paths = values.get('path', '')
        patterns = [line.strip() for line in raw_paths.split('\n') if line.strip()]
        values['path'] = patterns
        if any(fnmatch(link, pattern) for pattern in patterns):
            return values
    return False
def supported(link):
    """Tell whether a feedify rule exists for *link*.

    Returns ``True`` when ``getRule(link)`` yields a rule, and ``False``
    when it yields its ``False`` "no match" value.
    """
    rule = getRule(link)
    return rule is not False
def getString(html, expr):
    """Evaluate *expr* against *html* and return the results as one string.

    *expr* is passed through ``toclass`` and then to ``html.xpath``. The
    results are concatenated in order: string results are used as-is,
    ``HtmlElement`` results are serialized with ``lxml.html.tostring``, and
    results of any other type are skipped. Returns ``''`` when there is
    nothing to concatenate.
    """
    pieces = []
    for node in html.xpath(toclass(expr)):
        if isinstance(node, basestring):
            pieces.append(node)
            continue
        if isinstance(node, lxml.html.HtmlElement):
            pieces.append(lxml.html.tostring(node))
    return ''.join(pieces)
# Rule option -> key stored in each feed item dict.
_ITEM_FIELDS = (
    ('item_title', 'title'),
    ('item_link', 'link'),
    ('item_desc', 'desc'),
    ('item_content', 'content'),
    ('item_time', 'updated'),
)


def build(link, data=None):
    """Build a feed from the page at *link* using its feedify rule.

    link -- URL of the page; also the base against which item links are
            resolved.
    data -- HTML source of the page. When ``None`` it is downloaded from
            *link*.

    Returns ``False`` when ``getRule`` finds no rule for *link*. Otherwise
    returns a ``feeds.FeedParserAtom`` whose ``title`` comes from the
    rule's ``title`` expression and whose ``items`` holds one dict per
    node matched by the rule's ``items`` expression. Each dict carries only
    the fields for which the rule defines an ``item_*`` expression.
    """
    rule = getRule(link)
    if rule is False:
        return False

    if data is None:
        # Close the response explicitly so the connection is released even
        # if reading fails part-way through.
        response = urllib2.urlopen(link)
        try:
            data = response.read()
        finally:
            response.close()

    html = lxml.html.fromstring(data)
    feed = feeds.FeedParserAtom()

    if 'title' in rule:
        feed.title = getString(html, rule['title'])

    if 'items' in rule:
        for item in html.xpath(toclass(rule['items'])):
            feedItem = {}
            for option, key in _ITEM_FIELDS:
                if option not in rule:
                    continue
                value = getString(item, rule[option])
                if key == 'link':
                    # Item links may be relative to the page they came from.
                    value = urlparse.urljoin(link, value)
                feedItem[key] = value
            feed.items.append(feedItem)
    return feed