morss/feedify.py

#!/usr/bin/env python

from ConfigParser import ConfigParser
from fnmatch import fnmatch
import feeds
import re

import urllib2
import lxml.html
import urlparse

def toclass(query):
	"""Expand the `[class=NAME]` shorthand in *query* into a real XPath test.

	Every `[class=NAME]` predicate is replaced by a predicate that checks
	that the element has a class attribute containing NAME as one of its
	whitespace-separated tokens. Text outside such predicates is returned
	unchanged.
	"""
	classPredicate = re.compile(r'\[class=([^\]]+)\]')
	# Padding both the attribute value and the name with spaces makes the
	# contains() test match whole class tokens rather than substrings.
	tokenTest = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
	return classPredicate.sub(tokenTest, query)

def getRule(link):
	"""Return the first rule in feedify.ini whose path patterns match *link*.

	The file 'feedify.ini' is read relative to the current working
	directory. Each section is one rule; its 'path' option lists fnmatch
	patterns, one per line. Sections are tried in file order.

	link -- URL to look up.

	Returns the matching section as a dict of its options, with 'path'
	replaced by the list of its patterns, or False when no section matches
	(including when the file is missing or has no sections).
	"""
	config = ConfigParser()
	config.read('feedify.ini')

	for section in config.sections():
		values = dict(config.items(section))
		if 'path' not in values:
			# A section without patterns can never match; skip it rather
			# than letting a KeyError abort the lookup for every other rule.
			continue

		# Keep every non-empty line: this drops the blank first line left by
		# the usual "path =" + indented continuation layout, while still
		# honouring a pattern written on the same line as "path =".
		values['path'] = [path for path in values['path'].split('\n') if path]

		if any(fnmatch(link, path) for path in values['path']):
			return values
	return False

def supported(link):
	"""Tell whether some feedify rule applies to *link*.

	Returns True when getRule() finds a rule for the URL, False otherwise.
	"""
	rule = getRule(link)
	return rule is not False

def getString(html, expr):
	"""Evaluate *expr* against *html* and return the results as one string.

	html -- object exposing an xpath() method (an lxml element here).
	expr -- XPath expression; the `[class=NAME]` shorthand is expanded by
	        toclass() before evaluation.

	String results are used as they are, HtmlElement results are serialised
	with lxml.html.tostring(), and results of any other type are ignored.
	Returns '' when nothing matches.
	"""
	matches = html.xpath(toclass(expr))
	if len(matches) == 0:
		return ''

	pieces = []
	for node in matches:
		if isinstance(node, basestring):
			pieces.append(node)
		elif isinstance(node, lxml.html.HtmlElement):
			pieces.append(lxml.html.tostring(node))
	return ''.join(pieces)

def build(link, data=None):
	"""Scrape the page at *link* into a feed, driven by its feedify rule.

	link -- URL of the page; it selects the rule and is the base against
	        which extracted item links are resolved.
	data -- raw HTML of the page; fetched from *link* when None.

	Returns False when no rule matches *link*. Otherwise returns a
	feeds.FeedParserAtom whose title is set from the rule's 'title'
	expression (if any) and whose items list receives one dict per node
	matched by the rule's 'items' expression (if any).
	"""
	rule = getRule(link)
	if rule is False:
		return False

	if data is None:
		con = urllib2.urlopen(link)
		try:
			data = con.read()
		finally:
			# Always release the connection, even if the read fails.
			con.close()

	html = lxml.html.fromstring(data)
	feed = feeds.FeedParserAtom()

	if 'title' in rule:
		feed.title = getString(html, rule['title'])

	if 'items' not in rule:
		return feed

	# (rule option, feed item key) pairs, in the order they are extracted.
	fields = (
		('item_title', 'title'),
		('item_link', 'link'),
		('item_desc', 'desc'),
		('item_content', 'content'),
		('item_time', 'updated'),
	)

	for item in html.xpath(toclass(rule['items'])):
		feedItem = {}
		for option, key in fields:
			if option not in rule:
				continue
			value = getString(item, rule[option])
			if key == 'link':
				# Resolve the extracted link against the page URL so that a
				# relative href becomes an absolute one.
				value = urlparse.urljoin(link, value)
			feedItem[key] = value
		feed.items.append(feedItem)
	return feed