Always clean up downloads' encoding
TPB, fuck ye
This commit is contained in:
		
							
								
								
									
										35
									
								
								morss.py
									
									
									
									
									
								
							
							
						
						
									
										35
									
								
								morss.py
									
									
									
									
									
								
							@@ -284,9 +284,14 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
					return self.parent.open(new, timeout=req.timeout)
 | 
										return self.parent.open(new, timeout=req.timeout)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
			# decode
 | 
								# encoding
 | 
				
			||||||
			if self.decode:
 | 
								enc = detEncoding(data, resp)
 | 
				
			||||||
				data = decodeHTML(data, resp)
 | 
					
 | 
				
			||||||
 | 
								if enc:
 | 
				
			||||||
 | 
									data = data.decode(enc, 'replace')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
									if not self.decode:
 | 
				
			||||||
 | 
										data = data.encode(enc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		fp = StringIO(data)
 | 
							fp = StringIO(data)
 | 
				
			||||||
		old_resp = resp
 | 
							old_resp = resp
 | 
				
			||||||
@@ -298,21 +303,21 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
 | 
				
			|||||||
	https_response = http_response
 | 
						https_response = http_response
 | 
				
			||||||
	https_request = http_request
 | 
						https_request = http_request
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def decodeHTML(data, con=None):
 | 
					def detEncoding(data, con=None):
 | 
				
			||||||
	if con is not None and con.headers.getparam('charset'):
 | 
						if con is not None and con.headers.getparam('charset'):
 | 
				
			||||||
		log('header')
 | 
							log('header')
 | 
				
			||||||
		enc = con.headers.getparam('charset')
 | 
							return con.headers.getparam('charset')
 | 
				
			||||||
	else:
 | 
					 | 
				
			||||||
		match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data)
 | 
					 | 
				
			||||||
		if match:
 | 
					 | 
				
			||||||
			log('meta.re')
 | 
					 | 
				
			||||||
			enc = match.groups()[0]
 | 
					 | 
				
			||||||
		else:
 | 
					 | 
				
			||||||
			log('chardet')
 | 
					 | 
				
			||||||
			enc = chardet.detect(data)['encoding']
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	log(enc)
 | 
						match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data)
 | 
				
			||||||
	return data.decode(enc, 'replace') if enc else data
 | 
						if match:
 | 
				
			||||||
 | 
							log('meta.re')
 | 
				
			||||||
 | 
							return match.groups()[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
 | 
				
			||||||
 | 
						if match:
 | 
				
			||||||
 | 
							return match.groups()[0].lower()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def Fix(item, feedurl='/'):
 | 
					def Fix(item, feedurl='/'):
 | 
				
			||||||
	""" Improves feed items (absolute links, resolve feedburner links, etc) """
 | 
						""" Improves feed items (absolute links, resolve feedburner links, etc) """
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user