Improve broken XML support
The TPB feed is a good example: <http://rss.thepiratebay.sx/blog>. Ampersands in a feed are now supported by using the "recover" mode of etree.XMLParser. Broken UTF-8 strings in a feed are now also supported.
master
parent
5ebd84ee55
commit
1b7fdad6a8
5
feeds.py
5
feeds.py
|
@@ -42,7 +42,10 @@ class FeedException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def parse(data):
|
def parse(data):
|
||||||
doc = etree.fromstring(data)
|
data = data.decode('utf-8', 'replace').encode('utf-8')
|
||||||
|
parser = etree.XMLParser(recover=True)
|
||||||
|
doc = etree.fromstring(data, parser)
|
||||||
|
|
||||||
match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
|
match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
|
||||||
if len(match):
|
if len(match):
|
||||||
mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
|
mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
|
||||||
|
|
Loading…
Reference in New Issue