crawler: shift gzip & encoding-fix to intermediary handler

master
pictuga 2020-10-30 22:16:51 +01:00
parent 9ab2e488ef
commit 0efb096fa7
1 changed file with 6 additions and 37 deletions

View File

@@ -262,29 +262,17 @@ def UnGzip(data):
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
class GZIPHandler(BaseHandler):
class GZIPHandler(RespDataHandler):
def http_request(self, req):
req.add_unredirected_header('Accept-Encoding', 'gzip')
return req
def http_response(self, req, resp):
def data_response(self, req, resp, data):
if 200 <= resp.code < 300:
if resp.headers.get('Content-Encoding') == 'gzip':
data = resp.read()
data = UnGzip(data)
resp.headers['Content-Encoding'] = 'identity'
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
https_request = http_request
return UnGzip(data)
def detect_encoding(data, resp=None):
@@ -321,28 +309,9 @@ def detect_raw_encoding(data, resp=None):
return 'utf-8'
class EncodingFixHandler(BaseHandler):
def __init__(self, encoding=None):
self.encoding = encoding
def http_response(self, req, resp):
maintype = resp.info().get('Content-Type', '').split('/')[0]
if 200 <= resp.code < 300 and maintype == 'text':
data = resp.read()
enc = self.encoding or detect_encoding(data, resp)
data = data.decode(enc, 'replace')
data = data.encode(enc)
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class EncodingFixHandler(RespStrHandler):
def str_response(self, req, resp, data_str):
return data_str
class UAHandler(BaseHandler):