morss/feedify.py

#!/usr/bin/env python

from ConfigParser import ConfigParser
from fnmatch import fnmatch
import feeds
import re

import urllib2
import lxml.html
import urlparse

def toclass(query):
	"""Expand CSS-like ``[class=name]`` predicates inside an XPath query.

	Every ``[class=name]`` occurrence is replaced by an XPath predicate that
	tests whether ``name`` is one of the whitespace-separated tokens of the
	``class`` attribute. Text outside such predicates is returned unchanged.
	"""
	class_selector = re.compile(r'\[class=([^\]]+)\]')
	# Padding both sides with spaces makes " name " match a whole token only,
	# never a substring of a longer class name.
	token_test = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
	return class_selector.sub(token_test, query)

def getRule(link):
	"""Return the first rule of ``feedify.ini`` whose path patterns match *link*.

	``feedify.ini`` is read from the current working directory. Each section
	is one rule; its ``path`` option holds one ``fnmatch``-style pattern per
	line. Sections without a ``path`` option are skipped.

	Returns the matching section as a dict, with ``'path'`` replaced by the
	list of its patterns, or ``False`` (not ``None``) when no rule matches.
	"""
	config = ConfigParser()
	# read() silently ignores a missing file, which simply yields no sections.
	config.read('feedify.ini')

	for section in config.sections():
		values = dict(config.items(section))
		# Drop blank lines rather than blindly discarding the first one, so
		# both the multi-line layout ("path =" followed by indented patterns)
		# and a single inline pattern ("path = http://...") are honoured.
		values['path'] = [
			pattern for pattern in values.get('path', '').split('\n')
			if pattern.strip()
		]
		for path in values['path']:
			if fnmatch(link, path):
				return values
	return False

def supported(link):
	"""Tell whether some rule in the configuration matches *link*.

	True when getRule() returns a rule, False when it returns False.
	"""
	rule = getRule(link)
	return rule is not False

def getString(expr, html):
	"""Return the text content of the first node that *expr* selects in *html*.

	*expr* is passed through toclass() before being evaluated with
	``html.xpath``. An empty string is returned when nothing matches.
	"""
	found = html.xpath(toclass(expr))
	if len(found) == 0:
		return ''
	return found[0].text_content()

def _firstMatch(node, expr):
	"""Return the first result of rule expression *expr* on *node*, or None."""
	matches = node.xpath(toclass(expr))
	return matches[0] if matches else None

def build(link, data=None):
	"""Build a feed from the HTML page at *link* using its matching rule.

	link -- URL of the page; used to select the rule and to resolve
	        relative item links.
	data -- raw HTML of the page; downloaded from *link* when None.

	Returns False when getRule() finds no rule for *link*, otherwise a
	``feeds.FeedParserAtom`` filled from the rule's ``title``, ``items``,
	``item_title``, ``item_link``, ``item_desc`` and ``item_content``
	expressions. A field whose expression matches nothing is left unset
	instead of aborting the whole feed with an IndexError.
	"""
	# Function-scope import keeps this block self-contained.
	from contextlib import closing

	rule = getRule(link)
	if rule is False:
		return False

	if data is None:
		# urllib2 responses are not context managers; closing() makes sure
		# the connection is released even if read() raises.
		with closing(urllib2.urlopen(link)) as response:
			data = response.read()

	html = lxml.html.fromstring(data)
	feed = feeds.FeedParserAtom()

	if 'title' in rule:
		title = _firstMatch(html, rule['title'])
		# Compare with None explicitly: a childless lxml element is falsy.
		if title is not None:
			feed.title = title

	if 'items' in rule:
		for item in html.xpath(toclass(rule['items'])):
			feedItem = {}

			if 'item_title' in rule:
				itemTitle = _firstMatch(item, rule['item_title'])
				if itemTitle is not None:
					feedItem['title'] = itemTitle
			if 'item_link' in rule:
				url = _firstMatch(item, rule['item_link'])
				if url is not None:
					# Resolve relative hrefs against the page URL.
					feedItem['link'] = urlparse.urljoin(link, url)
			if 'item_desc' in rule:
				desc = _firstMatch(item, rule['item_desc'])
				if desc is not None:
					feedItem['desc'] = lxml.html.tostring(desc, encoding='unicode')
			if 'item_content' in rule:
				content = _firstMatch(item, rule['item_content'])
				if content is not None:
					# NOTE(review): unlike 'desc', this is serialized without
					# encoding='unicode', so it is presumably a byte string --
					# confirm whether the difference is intended.
					feedItem['content'] = lxml.html.tostring(content)

			feed.items.append(feedItem)
	return feed
Add feedify, and use it in morss 2013-09-25 10:36:21 +00:00			`#!/usr/bin/env python`

			`from ConfigParser import ConfigParser`
			`from fnmatch import fnmatch`
			`import feeds`
			`import re`

			`import urllib2`
			`import lxml.html`
			`import urlparse`

			`def toclass(query):`
			`pattern = r'\[class=([^\]]+)\]'`
			`repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'`
			`return re.sub(pattern, repl, query)`

			`def getRule(link=URL):`
			`config = ConfigParser()`
			`config.read('feedify.ini')`

			`for section in config.sections():`
			`values = dict(config.items(section))`
			`values['path'] = values['path'].split('\n')[1:]`
			`for path in values['path']:`
			`if fnmatch(link, path):`
			`return values`
			`return False`

			`def supported(link):`
			`return getRule(link) is not False`

			`def getString(expr, html):`
			`match = html.xpath(toclass(expr))`
			`if len(match):`
			`return match[0].text_content()`
			`else:`
			`return ''`

			`def build(link, data=None):`
			`rule = getRule(link)`
			`if rule is False:`
			`return False`

			`if data is None:`
			`data = urllib2.urlopen(link).read()`

			`html = lxml.html.fromstring(data)`
			`feed = feeds.FeedParserAtom()`

			`if 'title' in rule:`
			`feed.title = html.xpath(toclass(rule['title']))[0]`

			`if 'items' in rule:`
			`for item in html.xpath(toclass(rule['items'])):`
			`feedItem = {}`

			`if 'item_title' in rule:`
			`feedItem['title'] = item.xpath(toclass(rule['item_title']))[0]`
			`if 'item_link' in rule:`
			`url = item.xpath(toclass(rule['item_link']))[0]`
			`url = urlparse.urljoin(link, url)`
			`feedItem['link'] = url`
			`if 'item_desc' in rule:`
			`feedItem['desc'] = lxml.html.tostring(item.xpath(toclass(rule['item_desc']))[0], encoding='unicode')`
			`if 'item_content' in rule:`
			`feedItem['content'] = lxml.html.tostring(item.xpath(toclass(rule['item_content']))[0])`

			`feed.items.append(feedItem)`
			`return feed`