Add feeds.py
This is a huge change. Feed parsing is now done in a separate file, which is much cleaner. The library code still tends to repeat itself a lot, though; it should be possible to improve that. The code should be more stable.
This commit is contained in:
		
							
								
								
									
										354
									
								
								feeds.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										354
									
								
								feeds.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,354 @@
 | 
			
		||||
#!/usr/bin/env python
 | 
			
		||||
 | 
			
		||||
from lxml import etree
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
# Namespace prefixes usable in the xpath rules and in tagNS, mapped to
# their namespace uri.
NSMAP = dict(
	atom='http://www.w3.org/2005/Atom',
	atom03='http://purl.org/atom/ns#',
	media='http://search.yahoo.com/mrss/',
	rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
	slash='http://purl.org/rss/1.0/modules/slash/',
	dc='http://purl.org/dc/elements/1.1/',
	content='http://purl.org/rss/1.0/modules/content/',
	rssfake='http://purl.org/rss/1.0/',
)
 | 
			
		||||
 | 
			
		||||
def load(url):
	"""
	Downloads the document found at url and hands it to parse()

	Returns whatever parse() returns. Errors raised while opening or
	reading the url, and by parse(), are not caught here.
	"""
	from contextlib import closing
	try:
		from urllib2 import urlopen
	except ImportError:
		# python 3 moved urlopen to urllib.request
		from urllib.request import urlopen

	# closing() releases the connection even when read() fails; the
	# response used to be left open
	with closing(urlopen(url)) as response:
		d = response.read()
	return parse(d)
 | 
			
		||||
 | 
			
		||||
def tagNS(tag, nsmap=None):
	"""
	Converts a tag name between its two notations

	'{uri}name' becomes 'prefix:name' and 'prefix:name' becomes '{uri}name'.
	The name part is lowercased in both directions. The tag is returned
	unchanged when it has no namespace part, or when its uri/prefix is not
	listed in nsmap.

	nsmap maps prefixes to namespace uris. When omitted, NSMAP is used.
	"""
	if nsmap is None:
		# resolved at call time instead of being bound once at definition
		nsmap = NSMAP

	match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
	if match:
		uri, name = match.groups()
		# .items() works on python 2 and 3, .iteritems() only on python 2
		for (key, url) in nsmap.items():
			if url == uri:
				return "%s:%s" % (key, name.lower())
		return tag

	match = re.search(r'^([^:]+):([^:]+)$', tag)
	if match:
		prefix, name = match.groups()
		if prefix in nsmap:
			return "{%s}%s" % (nsmap[prefix], name.lower())
	return tag
 | 
			
		||||
 | 
			
		||||
def innerHTML(xml):
	""" Returns the text of xml followed by each of its children serialized with lxml """
	parts = [xml.text or '']
	for child in xml.iterchildren():
		parts.append(etree.tostring(child))
	return ''.join(parts)
 | 
			
		||||
 | 
			
		||||
def cleanNode(xml):
	""" Removes every child of xml. The .text of xml itself is left untouched """
	# loop over a snapshot since the node is modified along the way, and
	# use a plain loop rather than a list comprehension kept only for its
	# side effects
	for child in list(xml):
		xml.remove(child)
 | 
			
		||||
 | 
			
		||||
class FeedException(Exception):
	""" Error raised when the data cannot be handled as a known feed type """
 | 
			
		||||
 | 
			
		||||
def parse(data):
	"""
	Builds the feed object matching the xml held in data

	The first feed-like element found decides which class is used:
	FeedParserAtom for atom feeds, FeedParserRSS for rss/rdf ones.
	Raises FeedException when no known feed element is found.
	"""
	# NOTE(review): data may come straight from the network (see load) --
	# confirm the default lxml parser settings are acceptable for that
	doc = etree.fromstring(data)
	found = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)

	if not found:
		raise FeedException('unknow feed type')

	parsers = {
		'atom03:feed': FeedParserAtom, 'atom:feed': FeedParserAtom,
		'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS }
	tag = tagNS(found[0].tag)
	parser = parsers.get(tag)
	if parser is None:
		raise FeedException('unknow feed type')

	return parser(doc, tag)
 | 
			
		||||
 | 
			
		||||
class FeedBase(object):
	"""
	Xpath helpers shared by the feed and item wrappers

	The methods rely on attributes set by the subclasses: self.root (node
	the xpath rules are run against), self.xml (node given to tostring)
	and self.tag (key used to pick an entry in the xgetCreate tables).
	"""
	def xpath(self, path):
		""" Test xpath rule on xml tree """
		return self.root.xpath(path, namespaces=NSMAP)

	def xget(self, path):
		""" Returns the 1st xpath match, None when nothing matches """
		match = self.xpath(path)
		if match:
			return match[0]
		return None

	def xval(self, path):
		""" Returns the .text of the 1st match, "" when there is no match or no text """
		match = self.xget(path)
		if match is None:
			return ""
		# an element without text has .text set to None; always hand a
		# string back, as is already done when nothing matches
		return match.text or ""

	def xgetCreate(self, table):
		"""
		Returns an element, and creates it when not present

		table maps self.tag to the xpath rule / tag name to use. A newly
		created element is appended to self.root.
		"""
		tag = table[self.tag]
		match = self.xget(tag)
		if match is None:
			match = etree.Element(tagNS(tag))
			self.root.append(match)
		return match

	def tostring(self, **k):
		""" Returns string using lxml. Arguments passed to tostring """
		# pretty_print is still on by default, but passing it explicitly no
		# longer fails with "got multiple values for keyword argument"
		k.setdefault('pretty_print', True)
		return etree.tostring(self.xml, **k)
 | 
			
		||||
 | 
			
		||||
class FeedParser(FeedBase):
	"""
	Base wrapper around a whole feed

	Exposes the feed through the title, description (alias desc) and items
	properties, and behaves like a sequence of items (indexing, del, len).
	The get*/set* methods defined here are placeholders meant to be
	overridden by the format specific subclasses.
	"""
	# name of the class used to wrap items, looked up in this module
	FeedItem = 'FeedItem'
	mimetype = 'application/xml'

	def __init__(self, xml, tag):
		"""
		xml is the parsed document, tag the feed type as returned by tagNS

		self.root is set to the first feed/channel element found in xml.
		"""
		self.xml = xml
		self.root = self.xml.xpath("//atom03:feed|//atom:feed|//channel|//rssfake:channel", namespaces=NSMAP)[0]
		self.tag = tag
		self._items = {} # id(xml) => FeedItem

	def getTitle(self):
		return ""

	def setTitle(self, value):
		pass

	def getDesc(self):
		# used to fall through and return None, unlike every other getter
		return ""

	def setDesc(self, value):
		pass

	def getItems(self):
		return []

	def setItems(self, value):
		pass

	# the lambdas delay the lookup, so that overridden get*/set* methods
	# of the subclasses are the ones being called
	title = property(
		fget=lambda self: self.getTitle(),
		fset=lambda self, v: self.setTitle(v))
	description = desc = property(
		fget=lambda self: self.getDesc(),
		fset=lambda self, v: self.setDesc(v))
	items = property(
		fget=lambda self: self._getItems(),
		fset=lambda self, v: self.setItems(v))

	def _getItems(self):
		""" Returns the items wrapped in their item class, reusing cached wrappers """
		out = []
		for item in self.getItems():
			key = id(item)
			if key not in self._items:
				# plain name lookup in the module instead of eval(): same
				# class, without running the string as code
				self._items[key] = globals()[self.FeedItem](item, self.tag)
			out.append(self._items[key])
		return out

	def __getitem__(self, key):
		return self.items[key]

	def __delitem__(self, key):
		""" Removes the item from the xml tree, and from the cache if wrapped """
		item = self.getItems()[key]
		if id(item) in self._items:
			self._items[id(item)].remove()
			del self._items[id(item)]
		else:
			item.getparent().remove(item)

	def __len__(self):
		return len(self.getItems())
 | 
			
		||||
 | 
			
		||||
class FeedParserRSS(FeedParser):
	"""
	RSS Parser
	"""
	FeedItem = 'FeedItemRSS'
	mimetype = 'application/rss+xml'

	def getTitle(self):
		""" Text of the feed title element, namespaced or not """
		return self.xval('rssfake:title|title')

	def setTitle(self, value):
		""" Stores value in the feed title element, created when missing """
		self.xgetCreate({
			'rdf:rdf':	'rssfake:title',
			'channel':	'title'}).text = value

	def getDesc(self):
		""" Text of the feed description element, namespaced or not """
		return self.xval('rssfake:description|description')

	def setDesc(self, value):
		""" Stores value in the feed description element, created when missing """
		self.xgetCreate({
			'rdf:rdf':	'rssfake:description',
			'channel':	'description'}).text = value

	def getItems(self):
		""" List of the item nodes found under the root """
		return self.xpath('rssfake:item|item')
 | 
			
		||||
 | 
			
		||||
class FeedParserAtom(FeedParser):
	"""
	Atom Parser
	"""
	FeedItem = 'FeedItemAtom'
	mimetype = 'application/atom+xml'

	def getTitle(self):
		""" Text of the feed title element, in either atom namespace """
		return self.xval('atom:title|atom03:title')

	def setTitle(self, value):
		""" Stores value in the feed title element, created when missing """
		self.xgetCreate({
			'atom:feed':	'atom:title',
			'atom03:feed':	'atom03:title'}).text = value

	def getDesc(self):
		""" Text of the feed subtitle element, in either atom namespace """
		return self.xval('atom:subtitle|atom03:subtitle')

	def setDesc(self, value):
		""" Stores value in the feed subtitle element, created when missing """
		self.xgetCreate({
			'atom:feed':	'atom:subtitle',
			'atom03:feed':	'atom03:subtitle'}).text = value

	def getItems(self):
		""" List of the entry nodes found under the root """
		return self.xpath('atom:entry|atom03:entry')
 | 
			
		||||
 | 
			
		||||
class FeedItem(FeedBase):
	"""
	Base wrapper around a single item of a feed

	Exposes the item through the title, link, description (alias desc) and
	content properties. The get*/set* methods defined here are placeholders
	meant to be overridden by the format specific subclasses.
	"""
	def __init__(self, xml, tag):
		""" xml is the item node, tag the type of the feed it belongs to """
		self.root = self.xml = xml
		self.tag = tag

	def getTitle(self):
		return ""

	def setTitle(self, value):
		# "value" was missing from the signature, which made the title
		# setter fail with a TypeError on this class
		pass

	def getLink(self):
		# default needed by the link property, which used to raise an
		# AttributeError on this class
		return ""

	def setLink(self, value):
		pass

	def getDesc(self):
		return ""

	def setDesc(self, value):
		pass

	def getContent(self):
		return ""

	def setContent(self, value):
		pass

	# the lambdas delay the lookup, so that overridden get*/set* methods
	# of the subclasses are the ones being called
	title = property(
		fget=lambda self: self.getTitle(),
		fset=lambda self, v: self.setTitle(v))
	link = property(
		fget=lambda self: self.getLink(),
		fset=lambda self, v: self.setLink(v))
	description = desc = property(
		fget=lambda self: self.getDesc(),
		fset=lambda self, v: self.setDesc(v))
	content = property(
		fget=lambda self: self.getContent(),
		fset=lambda self, v: self.setContent(v))

	def remove(self):
		""" Detaches the item node from its parent in the xml tree """
		self.xml.getparent().remove(self.xml)
 | 
			
		||||
 | 
			
		||||
class FeedItemRSS(FeedItem):
	""" Item of an RSS feed """
	def getTitle(self):
		""" Text of the item title element, namespaced or not """
		return self.xval('rssfake:title|title')

	def setTitle(self, value):
		""" Stores value in the item title element, created when missing """
		self.xgetCreate({
			'rdf:rdf':	'rssfake:title',
			'channel':	'title'}).text = value

	def getLink(self):
		""" Text of the item link element, namespaced or not """
		return self.xval('rssfake:link|link')

	def setLink(self, value):
		""" Stores value in the item link element, created when missing """
		self.xgetCreate({
			'rdf:rdf':	'rssfake:link',
			'channel':	'link'}).text = value

	def getDesc(self):
		""" Text of the item description element, namespaced or not """
		return self.xval('rssfake:description|description')

	def setDesc(self, value):
		""" Stores value in the item description element, created when missing """
		self.xgetCreate({
			'rdf:rdf':	'rssfake:description',
			'channel':	'description'}).text = value

	def getContent(self):
		""" Text of the content:encoded element of the item """
		return self.xval('content:encoded')

	def setContent(self, value):
		""" Stores value in the content:encoded element, created when missing """
		self.xgetCreate({
			'rdf:rdf':	'content:encoded',
			'channel':	'content:encoded'}).text = value
 | 
			
		||||
 | 
			
		||||
class FeedItemAtom(FeedItem):
	""" Entry of an Atom feed, in either atom namespace """
	def getTitle(self):
		""" Text of the entry title element """
		return self.xval('atom:title|atom03:title')

	def setTitle(self, value):
		""" Stores value in the entry title element, created when missing """
		table = {	'atom:feed':	'atom:title',
					'atom03:feed':	'atom03:title'}
		element = self.xgetCreate(table)
		element.text = value

	def getLink(self):
		""" href of the first link element of the entry, "" when there is none """
		element = self.xget('atom:link|atom03:link')
		if element is None:
			# an entry without any link used to end in an AttributeError
			return ""
		return element.get('href', '')

	def setLink(self, value):
		""" Stores value in the href of the link element, created when missing """
		table = {	'atom:feed':	'atom:link',
					'atom03:feed':	'atom03:link'}
		element = self.xgetCreate(table)
		element.attrib['href'] = value

	def getDesc(self):
		""" Inner markup of the summary element, "" when there is none """
		# default "type" is "text"
		element = self.xget('atom:summary|atom03:summary')
		if element is None:
			return ""
		return innerHTML(element)

	def setDesc(self, value):
		""" Stores value as the text of the summary element, marked as html """
		table = {	'atom:feed':	'atom:summary',
					'atom03:feed':	'atom03:summary'}
		element = self.xgetCreate(table)
		# xhtml content is stored as child nodes, drop them before the
		# element is switched to plain html text
		if element.attrib.get('type', '') == 'xhtml':
			cleanNode(element)
		element.attrib['type'] = 'html'
		element.text = value

	def getContent(self):
		""" Inner markup of the content element, "" when there is none """
		element = self.xget('atom:content|atom03:content')
		if element is None:
			return ""
		return innerHTML(element)

	def setContent(self, value):
		""" Stores value as the text of the content element, marked as html """
		table = {	'atom:feed':	'atom:content',
					'atom03:feed':	'atom03:content'}
		element = self.xgetCreate(table)
		# xhtml content is stored as child nodes, drop them before the
		# element is switched to plain html text
		if element.attrib.get('type', '') == 'xhtml':
			cleanNode(element)
		element.attrib['type'] = 'html'
		element.text = value
 | 
			
		||||
		Reference in New Issue
	
	Block a user