morss/feedify.py

#!/usr/bin/env python

from ConfigParser import ConfigParser
from fnmatch import fnmatch
import feeds
import morss
import re

import urllib2
import lxml.html
import json
import urlparse

def toclass(query):
	pattern = r'\[class=([^\]]+)\]'
	repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'
	return re.sub(pattern, repl, query)

def getRule(link):
	config = ConfigParser()
	config.read('feedify.ini')

	for section in config.sections():
		values = dict(config.items(section))
		values['path'] = values['path'].split('\n')[1:]
		for path in values['path']:
			if fnmatch(link, path):
				return values
	return False

def supported(link):
	return getRule(link) is not False

def formatString(string, getter, error=False):
	out = ""
	char = string[0]

	follow = string[1:]

	if char == '"':
		match = follow.partition('"')
		out = match[0]
		if len(match) >= 2:
			next = match[2]
		else:
			next = None
	elif char == '{':
		match = follow.partition('}')
		try:
			test = formatString(match[0], getter, True)
		except ValueError, KeyError:
			pass
		else:
			out = test

		next = match[2]
	elif char == ' ':
		next = follow
	elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):
		match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()
		rawValue = getter(match[0])
		print repr(rawValue)
		if not isinstance(rawValue, basestring):
			if match[1] is not None:
				out = match[1].join(rawValue)
			else:
				out = ''.join(rawValue)
		if not out and error:
			raise ValueError
		next = match[2]
	else:
		raise ValueError('bogus string')

	if next is not None and len(next):
		return out + formatString(next, getter, error)
	else:
		return out

class Builder(object):
	def __init__(self, link, data=None):
		self.link = link

		if data is None:
			data = urllib2.urlopen(link).read()
		self.data = data

		self.rule = getRule(link)

		if self.rule['mode'] == 'xpath':
			self.data = morss.decodeHTML(self.data)
			self.doc = lxml.html.fromstring(self.data)
		elif self.rule['mode'] == 'json':
			self.doc = json.loads(data)

		self.feed = feeds.FeedParserAtom()

	def raw(self, html, expr):
		if self.rule['mode'] == 'xpath':
			print 1, toclass(expr)
			return html.xpath(toclass(expr))

		elif self.rule['mode'] == 'json':
			a = [html]
			b = []
			for x in expr.strip(".").split("."):
				match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
				for elem in a:
					if isinstance(elem, dict):
						kids = elem.get(match[0])
						if kids is None:
							pass
						elif isinstance(kids, list):
							[b.append(i) for i in kids]
						elif isinstance(kids, basestring):
							b.append(kids.replace('\n', '<br/>'))
						else:
							b.append(kids)

				if match[1] is None:
					a = b
				else:
					if len(b)-1 >= int(match[1]):
						a = [b[int(match[1])]]
					else:
						a = []
				b = []
			return a

	def strings(self, html, expr):
		if self.rule['mode'] == 'xpath':
			out = []
			for match in self.raw(html, expr):
				if isinstance(match, basestring):
					out.append(match)
				elif isinstance(match, lxml.html.HtmlElement):
					out.append(lxml.html.tostring(match))
			return out

		elif self.rule['mode'] == 'json':
			return self.raw(html, expr)

	def string(self, html, expr):
		getter = lambda x: self.strings(html, x)
		return formatString(self.rule[expr], getter)

	def build(self):
		if 'title' in self.rule:
			self.feed.title = self.string(self.doc, 'title')

		if 'items' in self.rule:
			matches = self.raw(self.doc, self.rule['items'])
			if matches and len(matches):
				for item in matches:
					feedItem = {}

					if 'item_title' in self.rule:
						feedItem['title'] = self.string(item, 'item_title')
					if 'item_link' in self.rule:
						url = self.string(item, 'item_link')
						url = urlparse.urljoin(self.link, url)
						feedItem['link'] = url
					if 'item_desc' in self.rule:
						feedItem['desc'] = self.string(item, 'item_desc')
					if 'item_content' in self.rule:
						feedItem['content'] = self.string(item, 'item_content')
					if 'item_time' in self.rule:
						feedItem['updated'] = self.string(item, 'item_time')

					self.feed.items.append(feedItem)
Add feedify, and use it in morss 2013-09-25 10:36:21 +00:00			`#!/usr/bin/env python`

			`from ConfigParser import ConfigParser`
			`from fnmatch import fnmatch`
			`import feeds`
Add support for JSON APIs in feedify 2013-10-21 19:28:43 +00:00			`import morss`
Add feedify, and use it in morss 2013-09-25 10:36:21 +00:00			`import re`

			`import urllib2`
			`import lxml.html`
Add support for JSON APIs in feedify 2013-10-21 19:28:43 +00:00			`import json`
Add feedify, and use it in morss 2013-09-25 10:36:21 +00:00			`import urlparse`

			`def toclass(query):`
			`pattern = r'\[class=([^\]]+)\]'`
			`repl = r'[@class and contains(concat(" ", normalize-space(@class), " "), " \1 ")]'`
			`return re.sub(pattern, repl, query)`

Improve feedify string grabbing 2013-10-01 18:18:55 +00:00			`def getRule(link):`
Add feedify, and use it in morss 2013-09-25 10:36:21 +00:00			`config = ConfigParser()`
			`config.read('feedify.ini')`

			`for section in config.sections():`
			`values = dict(config.items(section))`
			`values['path'] = values['path'].split('\n')[1:]`
			`for path in values['path']:`
			`if fnmatch(link, path):`
			`return values`
			`return False`

			`def supported(link):`
			`return getRule(link) is not False`

Add support for JSON APIs in feedify 2013-10-21 19:28:43 +00:00			`def formatString(string, getter, error=False):`
			`out = ""`
			`char = string[0]`

			`follow = string[1:]`

			`if char == '"':`
			`match = follow.partition('"')`
			`out = match[0]`
			`if len(match) >= 2:`
			`next = match[2]`
			`else:`
			`next = None`
			`elif char == '{':`
			`match = follow.partition('}')`
			`try:`
			`test = formatString(match[0], getter, True)`
			`except ValueError, KeyError:`
			`pass`
			`else:`
			`out = test`

			`next = match[2]`
			`elif char == ' ':`
			`next = follow`
			`elif re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string):`
			`match = re.search(r'^([^{}<>" ]+)(?:<"([^>]+)">)?(.*)$', string).groups()`
			`rawValue = getter(match[0])`
			`print repr(rawValue)`
			`if not isinstance(rawValue, basestring):`
			`if match[1] is not None:`
			`out = match[1].join(rawValue)`
			`else:`
			`out = ''.join(rawValue)`
			`if not out and error:`
			`raise ValueError`
			`next = match[2]`
			`else:`
			`raise ValueError('bogus string')`

			`if next is not None and len(next):`
			`return out + formatString(next, getter, error)`
Add feedify, and use it in morss 2013-09-25 10:36:21 +00:00			`else:`
Add support for JSON APIs in feedify 2013-10-21 19:28:43 +00:00			`return out`

			`class Builder(object):`
			`def __init__(self, link, data=None):`
			`self.link = link`

			`if data is None:`
			`data = urllib2.urlopen(link).read()`
			`self.data = data`

			`self.rule = getRule(link)`

			`if self.rule['mode'] == 'xpath':`
			`self.data = morss.decodeHTML(self.data)`
			`self.doc = lxml.html.fromstring(self.data)`
			`elif self.rule['mode'] == 'json':`
			`self.doc = json.loads(data)`

			`self.feed = feeds.FeedParserAtom()`

			`def raw(self, html, expr):`
			`if self.rule['mode'] == 'xpath':`
			`print 1, toclass(expr)`
			`return html.xpath(toclass(expr))`

			`elif self.rule['mode'] == 'json':`
			`a = [html]`
			`b = []`
			`for x in expr.strip(".").split("."):`
			`match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()`
			`for elem in a:`
			`if isinstance(elem, dict):`
			`kids = elem.get(match[0])`
			`if kids is None:`
			`pass`
			`elif isinstance(kids, list):`
			`[b.append(i) for i in kids]`
			`elif isinstance(kids, basestring):`
			`b.append(kids.replace('\n', '<br/>'))`
			`else:`
			`b.append(kids)`

			`if match[1] is None:`
			`a = b`
			`else:`
			`if len(b)-1 >= int(match[1]):`
			`a = [b[int(match[1])]]`
			`else:`
			`a = []`
			`b = []`
			`return a`

			`def strings(self, html, expr):`
			`if self.rule['mode'] == 'xpath':`
			`out = []`
			`for match in self.raw(html, expr):`
			`if isinstance(match, basestring):`
			`out.append(match)`
			`elif isinstance(match, lxml.html.HtmlElement):`
			`out.append(lxml.html.tostring(match))`
			`return out`

			`elif self.rule['mode'] == 'json':`
			`return self.raw(html, expr)`

			`def string(self, html, expr):`
			`getter = lambda x: self.strings(html, x)`
			`return formatString(self.rule[expr], getter)`

			`def build(self):`
			`if 'title' in self.rule:`
			`self.feed.title = self.string(self.doc, 'title')`

			`if 'items' in self.rule:`
			`matches = self.raw(self.doc, self.rule['items'])`
			`if matches and len(matches):`
			`for item in matches:`
			`feedItem = {}`

			`if 'item_title' in self.rule:`
			`feedItem['title'] = self.string(item, 'item_title')`
			`if 'item_link' in self.rule:`
			`url = self.string(item, 'item_link')`
			`url = urlparse.urljoin(self.link, url)`
			`feedItem['link'] = url`
			`if 'item_desc' in self.rule:`
			`feedItem['desc'] = self.string(item, 'item_desc')`
			`if 'item_content' in self.rule:`
			`feedItem['content'] = self.string(item, 'item_content')`
			`if 'item_time' in self.rule:`
			`feedItem['updated'] = self.string(item, 'item_time')`

			`self.feed.items.append(feedItem)`