Fix bad characters detection
Now works with any encoding instead of being restricted to UTF-8. The encoding is detected with a regex (not perfect, but fast, since it only searches the first 100 characters of the data).
This commit is contained in:
		
							
								
								
									
										9
									
								
								feeds.py
									
									
									
									
									
								
							
							
						
						
									
										9
									
								
								feeds.py
									
									
									
									
									
								
							@@ -42,10 +42,17 @@ class FeedException(Exception):
 | 
				
			|||||||
	pass
 | 
						pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def parse(data):
 | 
					def parse(data):
 | 
				
			||||||
	data = data.decode('utf-8', 'replace').encode('utf-8')
 | 
						# encoding
 | 
				
			||||||
 | 
						match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
 | 
				
			||||||
 | 
						if match:
 | 
				
			||||||
 | 
							enc = match.groups()[0].lower()
 | 
				
			||||||
 | 
							data = data.decode(enc, 'ignore').encode(enc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# parse
 | 
				
			||||||
	parser = etree.XMLParser(recover=True)
 | 
						parser = etree.XMLParser(recover=True)
 | 
				
			||||||
	doc = etree.fromstring(data, parser)
 | 
						doc = etree.fromstring(data, parser)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						# rss
 | 
				
			||||||
	match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
 | 
						match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
 | 
				
			||||||
	if len(match):
 | 
						if len(match):
 | 
				
			||||||
		mtable = {	'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
 | 
							mtable = {	'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user