parent
e3f525ff2a
commit
245ba99ae9
35
morss.py
35
morss.py
|
@ -284,9 +284,14 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
|
||||||
|
|
||||||
return self.parent.open(new, timeout=req.timeout)
|
return self.parent.open(new, timeout=req.timeout)
|
||||||
|
|
||||||
# decode
|
# encoding
|
||||||
if self.decode:
|
enc = detEncoding(data, resp)
|
||||||
data = decodeHTML(data, resp)
|
|
||||||
|
if enc:
|
||||||
|
data = data.decode(enc, 'replace')
|
||||||
|
|
||||||
|
if not self.decode:
|
||||||
|
data = data.encode(enc)
|
||||||
|
|
||||||
fp = StringIO(data)
|
fp = StringIO(data)
|
||||||
old_resp = resp
|
old_resp = resp
|
||||||
|
@ -298,21 +303,21 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
|
||||||
https_response = http_response
|
https_response = http_response
|
||||||
https_request = http_request
|
https_request = http_request
|
||||||
|
|
||||||
def decodeHTML(data, con=None):
|
def detEncoding(data, con=None):
|
||||||
if con is not None and con.headers.getparam('charset'):
|
if con is not None and con.headers.getparam('charset'):
|
||||||
log('header')
|
log('header')
|
||||||
enc = con.headers.getparam('charset')
|
return con.headers.getparam('charset')
|
||||||
else:
|
|
||||||
match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data)
|
|
||||||
if match:
|
|
||||||
log('meta.re')
|
|
||||||
enc = match.groups()[0]
|
|
||||||
else:
|
|
||||||
log('chardet')
|
|
||||||
enc = chardet.detect(data)['encoding']
|
|
||||||
|
|
||||||
log(enc)
|
match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data)
|
||||||
return data.decode(enc, 'replace') if enc else data
|
if match:
|
||||||
|
log('meta.re')
|
||||||
|
return match.groups()[0]
|
||||||
|
|
||||||
|
match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
|
||||||
|
if match:
|
||||||
|
return match.groups()[0].lower()
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
def Fix(item, feedurl='/'):
|
def Fix(item, feedurl='/'):
|
||||||
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
||||||
|
|
Loading…
Reference in New Issue