crawler: shift gzip & encoding-fix to intermediary handler
parent
9ab2e488ef
commit
0efb096fa7
|
@ -262,29 +262,17 @@ def UnGzip(data):
|
||||||
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
|
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
|
||||||
|
|
||||||
|
|
||||||
class GZIPHandler(BaseHandler):
|
class GZIPHandler(RespDataHandler):
|
||||||
def http_request(self, req):
|
def http_request(self, req):
|
||||||
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
||||||
return req
|
return req
|
||||||
|
|
||||||
def http_response(self, req, resp):
|
def data_response(self, req, resp, data):
|
||||||
if 200 <= resp.code < 300:
|
if 200 <= resp.code < 300:
|
||||||
if resp.headers.get('Content-Encoding') == 'gzip':
|
if resp.headers.get('Content-Encoding') == 'gzip':
|
||||||
data = resp.read()
|
|
||||||
|
|
||||||
data = UnGzip(data)
|
|
||||||
|
|
||||||
resp.headers['Content-Encoding'] = 'identity'
|
resp.headers['Content-Encoding'] = 'identity'
|
||||||
|
|
||||||
fp = BytesIO(data)
|
return UnGzip(data)
|
||||||
old_resp = resp
|
|
||||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
|
||||||
resp.msg = old_resp.msg
|
|
||||||
|
|
||||||
return resp
|
|
||||||
|
|
||||||
https_response = http_response
|
|
||||||
https_request = http_request
|
|
||||||
|
|
||||||
|
|
||||||
def detect_encoding(data, resp=None):
|
def detect_encoding(data, resp=None):
|
||||||
|
@ -321,28 +309,9 @@ def detect_raw_encoding(data, resp=None):
|
||||||
return 'utf-8'
|
return 'utf-8'
|
||||||
|
|
||||||
|
|
||||||
class EncodingFixHandler(BaseHandler):
|
class EncodingFixHandler(RespStrHandler):
|
||||||
def __init__(self, encoding=None):
|
def str_response(self, req, resp, data_str):
|
||||||
self.encoding = encoding
|
return data_str
|
||||||
|
|
||||||
def http_response(self, req, resp):
|
|
||||||
maintype = resp.info().get('Content-Type', '').split('/')[0]
|
|
||||||
if 200 <= resp.code < 300 and maintype == 'text':
|
|
||||||
data = resp.read()
|
|
||||||
|
|
||||||
enc = self.encoding or detect_encoding(data, resp)
|
|
||||||
|
|
||||||
data = data.decode(enc, 'replace')
|
|
||||||
data = data.encode(enc)
|
|
||||||
|
|
||||||
fp = BytesIO(data)
|
|
||||||
old_resp = resp
|
|
||||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
|
||||||
resp.msg = old_resp.msg
|
|
||||||
|
|
||||||
return resp
|
|
||||||
|
|
||||||
https_response = http_response
|
|
||||||
|
|
||||||
|
|
||||||
class UAHandler(BaseHandler):
|
class UAHandler(BaseHandler):
|
||||||
|
|
Loading…
Reference in New Issue