crawler: fix truncated gzip reader

For python 3
master
pictuga 2017-11-04 12:07:08 +01:00
parent d39d0f4cae
commit a8c2df7f41
1 changed file with 14 additions and 15 deletions

View File

@ -102,22 +102,22 @@ class SizeLimitHandler(BaseHandler):
https_response = http_response https_response = http_response
def UnGzip(cprss, CHUNKSIZE=64*1024): # the bigger the CHUNKSIZE, the faster
    """ Supports truncated files.

    Decompress the gzip stream *cprss* (a binary file-like object) chunk
    by chunk and return the raw bytes. If the stream is truncated or its
    trailer is corrupt (EOFError/IOError raised mid-read), whatever
    decompressed cleanly so far is returned instead of raising.
    """
    gz = GzipFile(fileobj=cprss, mode='rb')
    data = b''

    try:
        # The first read must also sit inside the try: a stream truncated
        # within the first CHUNKSIZE of output (or with a corrupt header)
        # would otherwise raise instead of degrading gracefully.
        chunk = gz.read(CHUNKSIZE)

        while chunk:
            data += chunk
            chunk = gz.read(CHUNKSIZE)

    except (IOError, EOFError):
        pass # truncated input: return the partial data decoded so far

    return data
class GZIPHandler(BaseHandler): class GZIPHandler(BaseHandler):
@ -130,8 +130,7 @@ class GZIPHandler(BaseHandler):
if resp.headers.get('Content-Encoding') == 'gzip': if resp.headers.get('Content-Encoding') == 'gzip':
data = resp.read() data = resp.read()
with patch_gzip_for_partial(): data = UnGzip(BytesIO(data))
data = GzipFile(fileobj=BytesIO(data), mode='r').read()
resp.headers['Content-Encoding'] = 'identity' resp.headers['Content-Encoding'] = 'identity'