Add feeds.py

This is a huge change. Feed parsing is now done in a separate file, which is much cleaner. The library code still tends to repeat itself a lot, though, and it should be possible to improve that. The code should be more stable now.
master
pictuga 2013-07-14 18:25:49 +02:00
parent 6e891ef6ff
commit 8ac7d8b282
1 changed files with 354 additions and 0 deletions

354
feeds.py 100644
View File

@ -0,0 +1,354 @@
#!/usr/bin/env python
from lxml import etree
import re
# Prefix -> namespace URI table shared by every XPath query in this module
# and by tagNS() when converting between prefixed and Clark notation.
NSMAP = {
    # Atom 1.0 and the older Atom 0.3 draft
    'atom': 'http://www.w3.org/2005/Atom',
    'atom03': 'http://purl.org/atom/ns#',
    # Extension modules
    'media': 'http://search.yahoo.com/mrss/',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'slash': 'http://purl.org/rss/1.0/modules/slash/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'content': 'http://purl.org/rss/1.0/modules/content/',
    # RSS 1.0 elements live in a default namespace; XPath needs a prefix
    # to address them, hence this made-up one.
    'rssfake': 'http://purl.org/rss/1.0/',
}
def load(url):
    """Download the document at *url* and hand its body to parse().

    Returns whatever parse() returns; errors raised while fetching or
    parsing are propagated unchanged.
    """
    from contextlib import closing
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3

    # closing() guarantees the connection is released even if read() fails.
    with closing(urlopen(url)) as response:
        data = response.read()
    return parse(data)
def tagNS(tag, nsmap=NSMAP):
    """Convert a tag name between Clark and prefixed notation.

    ``{uri}Name`` becomes ``prefix:name`` when *uri* is one of the values
    of *nsmap*; ``prefix:Name`` becomes ``{uri}name`` when *prefix* is one
    of its keys. The local name is lower-cased in both directions. Any
    other input (no namespace, unknown URI or prefix) is returned as is.
    """
    match = re.search(r'^\{([^\}]+)\}(.*)$', tag)
    if match:
        uri, local = match.groups()
        # items() instead of iteritems(): available on Python 2 and 3 alike.
        for key, url in nsmap.items():
            if url == uri:
                return "%s:%s" % (key, local.lower())
    else:
        match = re.search(r'^([^:]+):([^:]+)$', tag)
        if match:
            prefix, local = match.groups()
            if prefix in nsmap:
                return "{%s}%s" % (nsmap[prefix], local.lower())
    return tag
def innerHTML(xml):
    """Return the element's leading text followed by its serialized children."""
    leading = xml.text or ''
    serialized = [etree.tostring(node) for node in xml.iterchildren()]
    return leading + ''.join(serialized)
def cleanNode(xml):
    """Remove every child element of *xml* in place.

    The element itself, its attributes and its own ``.text`` are left
    untouched. Returns None.
    """
    # Iterate over a snapshot so the container is never mutated while it is
    # being walked, and use a plain loop since only the side effect matters.
    for child in list(xml):
        xml.remove(child)
class FeedException(Exception):
    """Raised by parse() when the document is not a recognised feed type."""
def parse(data):
    """Parse raw feed markup and return the matching parser object.

    The first feed-like element found in the document (Atom 0.3 / 1.0
    ``feed``, RSS ``channel`` or ``rdf:RDF``) decides which parser class
    wraps the document.

    Raises FeedException when no such element exists or its tag is not
    associated with a parser. Errors raised by ``etree.fromstring`` are
    propagated unchanged.
    """
    doc = etree.fromstring(data)
    match = doc.xpath(
        "//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF",
        namespaces=NSMAP)
    if match:
        # Keys are in the prefixed, lower-cased form produced by tagNS().
        parsers = {
            'rdf:rdf': FeedParserRSS,
            'channel': FeedParserRSS,
            'atom03:feed': FeedParserAtom,
            'atom:feed': FeedParserAtom,
        }
        tag = tagNS(match[0].tag)
        if tag in parsers:
            return parsers[tag](doc, tag)
    raise FeedException('unknown feed type')
class FeedBase(object):
    """XPath helpers shared by the feed-level and item-level wrappers.

    The methods read three attributes that subclasses set in their
    constructor: ``self.root`` (context element for XPath queries),
    ``self.xml`` (element serialized by tostring) and ``self.tag`` (key
    into the lookup tables passed to xgetCreate).
    """

    def xpath(self, path):
        """ Test xpath rule on xml tree """
        return self.root.xpath(path, namespaces=NSMAP)

    def xget(self, path):
        """ Returns the 1st xpath match, or None when nothing matches """
        match = self.xpath(path)
        if len(match):
            return match[0]
        return None

    def xval(self, path):
        """ Returns the .text of the 1st match, always as a string """
        match = self.xget(path)
        # An element that exists but is empty has .text == None; report it
        # as "" so callers get the same type as for a missing element.
        if match is None or match.text is None:
            return ""
        return match.text

    def xgetCreate(self, table):
        """ Returns an element, and creates it when not present

        `table` maps a feed tag (the value of self.tag) to the prefixed
        tag name to look up; a missing element is appended to self.root.
        """
        tag = table[self.tag]
        match = self.xget(tag)
        if match is not None:
            return match
        element = etree.Element(tagNS(tag))
        self.root.append(element)
        return element

    def tostring(self, **k):
        """ Returns string using lxml. Arguments passed to tostring """
        return etree.tostring(self.xml, pretty_print=True, **k)
class FeedParser(FeedBase):
    """Format-independent feed wrapper.

    Subclasses override the get*/set* hooks for a concrete feed format; the
    ``title``, ``description``/``desc`` and ``items`` properties as well as
    the sequence protocol (indexing, ``del``, ``len``) are built on top of
    those hooks.
    """

    # Item wrapper to instantiate: either a class or the name of a class
    # defined in this module.
    FeedItem = 'FeedItem'
    mimetype = 'application/xml'

    def __init__(self, xml, tag):
        """Wrap the parsed document `xml`; `tag` is the detected feed tag."""
        self.xml = xml
        self.root = self.xml.xpath(
            "//atom03:feed|//atom:feed|//channel|//rssfake:channel",
            namespaces=NSMAP)[0]
        self.tag = tag
        self._items = {}  # id(xml) => FeedItem

    def getTitle(self):
        return ""

    def setTitle(self, value):
        pass

    def getDesc(self):
        # Return a string like getTitle() does, rather than an implicit None.
        return ""

    def setDesc(self, value):
        pass

    def getItems(self):
        return []

    def setItems(self, value):
        pass

    # The lambdas defer the lookup to call time so that subclass overrides
    # of the hooks are honoured.
    title = property(
        fget=lambda self: self.getTitle(),
        fset=lambda self, v: self.setTitle(v))
    description = desc = property(
        fget=lambda self: self.getDesc(),
        fset=lambda self, v: self.setDesc(v))
    items = property(
        fget=lambda self: self._getItems(),
        fset=lambda self, v: self.setItems(v))

    def _itemFactory(self):
        """Resolve self.FeedItem to a callable without resorting to eval()."""
        factory = self.FeedItem
        if not callable(factory):
            factory = globals()[factory]
        return factory

    def _getItems(self):
        """Return one wrapper per item element, reusing earlier wrappers."""
        out = []
        for item in self.getItems():
            key = id(item)
            if key not in self._items:
                self._items[key] = self._itemFactory()(item, self.tag)
            out.append(self._items[key])
        return out

    def __getitem__(self, key):
        return self.items[key]

    def __delitem__(self, key):
        item = self.getItems()[key]
        if id(item) in self._items:
            # Let the wrapper detach its element, then forget the wrapper.
            self._items[id(item)].remove()
            del self._items[id(item)]
        else:
            item.getparent().remove(item)

    def __len__(self):
        return len(self.getItems())
class FeedParserRSS(FeedParser):
    """
    RSS Parser

    Handles both plain RSS (un-namespaced elements, feed tag ``channel``)
    and RSS 1.0 / RDF (elements addressed through the ``rssfake`` prefix,
    feed tag ``rdf:rdf``).
    """
    FeedItem = 'FeedItemRSS'
    mimetype = 'application/rss+xml'

    def getTitle(self):
        return self.xval('rssfake:title|title')

    def setTitle(self, value):
        table = {'rdf:rdf': 'rssfake:title',
                 'channel': 'title'}
        element = self.xgetCreate(table)
        element.text = value

    def getDesc(self):
        return self.xval('rssfake:description|description')

    def setDesc(self, value):
        table = {'rdf:rdf': 'rssfake:description',
                 'channel': 'description'}
        element = self.xgetCreate(table)
        element.text = value

    def getItems(self):
        # In RSS 1.0 the <item> elements are siblings of <channel> under
        # rdf:RDF rather than its children, so look next to the root too.
        return self.xpath('rssfake:item|item|../rssfake:item')
class FeedParserAtom(FeedParser):
    """
    Atom Parser

    Covers both the ``atom:`` and the ``atom03:`` prefixes.
    """
    FeedItem = 'FeedItemAtom'
    mimetype = 'application/atom+xml'

    def getTitle(self):
        return self.xval('atom:title|atom03:title')

    def setTitle(self, value):
        # Pick the tag for the detected feed flavour, creating it if absent.
        self.xgetCreate({
            'atom:feed': 'atom:title',
            'atom03:feed': 'atom03:title',
        }).text = value

    def getDesc(self):
        return self.xval('atom:subtitle|atom03:subtitle')

    def setDesc(self, value):
        self.xgetCreate({
            'atom:feed': 'atom:subtitle',
            'atom03:feed': 'atom03:subtitle',
        }).text = value

    def getItems(self):
        return self.xpath('atom:entry|atom03:entry')
class FeedItem(FeedBase):
    """Format-independent wrapper around a single feed entry element.

    Subclasses override the get*/set* hooks; the ``title``, ``link``,
    ``description``/``desc`` and ``content`` properties delegate to them.
    """

    def __init__(self, xml, tag):
        """`xml` is the entry element, `tag` the tag of the enclosing feed."""
        self.root = self.xml = xml
        self.tag = tag

    def getTitle(self):
        return ""

    def setTitle(self, value=None):
        # The `title` property passes the new value; the default only keeps
        # the old zero-argument call form working.
        pass

    def getLink(self):
        # Defined here so the `link` property also works on the base class.
        return ""

    def setLink(self, value):
        pass

    def getDesc(self):
        return ""

    def setDesc(self, value):
        pass

    def getContent(self):
        return ""

    def setContent(self, value):
        pass

    # The lambdas defer the lookup to call time so that subclass overrides
    # of the hooks are honoured.
    title = property(
        fget=lambda self: self.getTitle(),
        fset=lambda self, v: self.setTitle(v))
    link = property(
        fget=lambda self: self.getLink(),
        fset=lambda self, v: self.setLink(v))
    description = desc = property(
        fget=lambda self: self.getDesc(),
        fset=lambda self, v: self.setDesc(v))
    content = property(
        fget=lambda self: self.getContent(),
        fset=lambda self, v: self.setContent(v))

    def remove(self):
        """Detach the wrapped element from its parent."""
        self.xml.getparent().remove(self.xml)
class FeedItemRSS(FeedItem):
    """Entry wrapper for RSS feeds (plain and ``rssfake``-prefixed tags)."""

    def getTitle(self):
        return self.xval('rssfake:title|title')

    def setTitle(self, value):
        # Pick the tag for the detected feed flavour, creating it if absent.
        self.xgetCreate({
            'rdf:rdf': 'rssfake:title',
            'channel': 'title',
        }).text = value

    def getLink(self):
        return self.xval('rssfake:link|link')

    def setLink(self, value):
        self.xgetCreate({
            'rdf:rdf': 'rssfake:link',
            'channel': 'link',
        }).text = value

    def getDesc(self):
        return self.xval('rssfake:description|description')

    def setDesc(self, value):
        self.xgetCreate({
            'rdf:rdf': 'rssfake:description',
            'channel': 'description',
        }).text = value

    def getContent(self):
        return self.xval('content:encoded')

    def setContent(self, value):
        # Same tag for both flavours.
        self.xgetCreate({
            'rdf:rdf': 'content:encoded',
            'channel': 'content:encoded',
        }).text = value
class FeedItemAtom(FeedItem):
    """Entry wrapper for Atom feeds (``atom:`` and ``atom03:`` prefixes)."""

    def getTitle(self):
        return self.xval('atom:title|atom03:title')

    def setTitle(self, value):
        table = {'atom:feed': 'atom:title',
                 'atom03:feed': 'atom03:title'}
        element = self.xgetCreate(table)
        element.text = value

    def getLink(self):
        """Return the ``href`` of the first link element, or '' if none."""
        element = self.xget('atom:link|atom03:link')
        # xget() yields None for an entry without a link element; calling
        # .get() on it would raise AttributeError.
        if element is None:
            return ''
        return element.get('href', '')

    def setLink(self, value):
        table = {'atom:feed': 'atom:link',
                 'atom03:feed': 'atom03:link'}
        element = self.xgetCreate(table)
        element.attrib['href'] = value

    def getDesc(self):
        # default "type" is "text"
        element = self.xget('atom:summary|atom03:summary')
        if element is not None:
            return innerHTML(element)
        return ""

    def setDesc(self, value):
        table = {'atom:feed': 'atom:summary',
                 'atom03:feed': 'atom03:summary'}
        element = self.xgetCreate(table)
        # xhtml payloads are child elements; drop them before the new value
        # is stored as text.
        if element.attrib.get('type', '') == 'xhtml':
            cleanNode(element)
        element.attrib['type'] = 'html'
        element.text = value

    def getContent(self):
        element = self.xget('atom:content|atom03:content')
        if element is not None:
            return innerHTML(element)
        return ""

    def setContent(self, value):
        table = {'atom:feed': 'atom:content',
                 'atom03:feed': 'atom03:content'}
        element = self.xgetCreate(table)
        # xhtml payloads are child elements; drop them before the new value
        # is stored as text.
        if element.attrib.get('type', '') == 'xhtml':
            cleanNode(element)
        element.attrib['type'] = 'html'
        element.text = value