Improve support for broken XML
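The TPB feed <http://rss.thepiratebay.sx/blog> is a good example of such a feed. Unescaped ampersands in a feed are now tolerated by parsing with an etree.XMLParser in "recover" mode (passed to etree.fromstring). Broken UTF-8 strings in a feed are now also supported: invalid byte sequences are replaced before parsing.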
This commit is contained in:
		
							
								
								
									
										5
									
								
								feeds.py
									
									
									
									
									
								
							
							
						
						
									
										5
									
								
								feeds.py
									
									
									
									
									
								
							@@ -42,7 +42,10 @@ class FeedException(Exception):
 | 
				
			|||||||
	pass
 | 
						pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def parse(data):
 | 
					def parse(data):
 | 
				
			||||||
	doc = etree.fromstring(data)
 | 
						data = data.decode('utf-8', 'replace').encode('utf-8')
 | 
				
			||||||
 | 
						parser = etree.XMLParser(recover=True)
 | 
				
			||||||
 | 
						doc = etree.fromstring(data, parser)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
 | 
						match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
 | 
				
			||||||
	if len(match):
 | 
						if len(match):
 | 
				
			||||||
		mtable = {	'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
 | 
							mtable = {	'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user