Improve broken XML support
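The TPB feed is a good example: <http://rss.thepiratebay.sx/blog>. Unescaped ampersands in a feed are now supported by parsing with the "recover" mode of the XML parser (etree.XMLParser(recover=True), passed to etree.fromstring). Broken UTF-8 strings in a feed are now also supported: the data is decoded as UTF-8 with invalid bytes replaced, then re-encoded before parsing.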
This commit is contained in:
		
							
								
								
									
										5
									
								
								feeds.py
									
									
									
									
									
								
							
							
						
						
									
										5
									
								
								feeds.py
									
									
									
									
									
								
							@@ -42,7 +42,10 @@ class FeedException(Exception):
 | 
			
		||||
	pass
 | 
			
		||||
 | 
			
		||||
def parse(data):
 | 
			
		||||
	doc = etree.fromstring(data)
 | 
			
		||||
	data = data.decode('utf-8', 'replace').encode('utf-8')
 | 
			
		||||
	parser = etree.XMLParser(recover=True)
 | 
			
		||||
	doc = etree.fromstring(data, parser)
 | 
			
		||||
 | 
			
		||||
	match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
 | 
			
		||||
	if len(match):
 | 
			
		||||
		mtable = {	'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user