crawler: limit download to 500KiB

Anything larger can only be linked to fraudulent/incorrect use of the service
master
pictuga 2017-10-27 23:12:40 +02:00
parent fbe811384a
commit 840842d246
1 changed file with 44 additions and 1 deletion


@@ -45,6 +45,7 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
     # & HTTPSHandler

     #handlers.append(DebugHandler())
+    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
     handlers.append(HTTPCookieProcessor())
     handlers.append(GZIPHandler())
     handlers.append(HTTPEquivHandler())
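
Note on ordering: SizeLimitHandler declares handler_order = 450 (next hunk), and urllib keeps response processors sorted by ascending handler_order (default 500), so the truncation runs before GZIPHandler ever decompresses the body. That ordering is exactly why the gzip patch further down is needed. A quick, self-contained way to see the ordering; the stripped-down stand-in class here is illustrative, not the real one:

from urllib.request import BaseHandler, build_opener

class SizeLimitHandler(BaseHandler):  # stand-in; the real class is in the next hunk
    handler_order = 450               # handlers default to 500

    def http_response(self, req, resp):
        return resp

    https_response = http_response

opener = build_opener(SizeLimitHandler())
# response processors are kept sorted by handler_order, lowest first
print([type(h).__name__ for h in opener.process_response['http']])
# -> ['SizeLimitHandler', 'HTTPErrorProcessor']
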
@@ -79,6 +80,45 @@ class DebugHandler(BaseHandler):
     https_response = http_response


+class SizeLimitHandler(BaseHandler):
+    """ Limit file size, defaults to 5MiB """
+
+    handler_order = 450
+
+    def __init__(self, limit=5*1024*1024):
+        self.limit = limit
+
+    def http_response(self, req, resp):
+        # read at most `limit` bytes, then re-wrap them in a file-like response
+        data = resp.read(self.limit)
+
+        fp = BytesIO(data)
+        old_resp = resp
+        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+        resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+
+
+import contextlib
+
+@contextlib.contextmanager
+def patch_gzip_for_partial():
+    """
+    Context manager that replaces gzip.GzipFile._read_eof with a no-op.
+    This is useful when decompressing partial files, something that won't
+    work if GzipFile does its checksum comparison.
+
+    from https://stackoverflow.com/a/18602286
+    """
+    _read_eof = GzipFile._read_eof
+    GzipFile._read_eof = lambda *args, **kwargs: None
+    yield
+    GzipFile._read_eof = _read_eof
+
+
 class GZIPHandler(BaseHandler):
     def http_request(self, req):
         req.add_unredirected_header('Accept-Encoding', 'gzip')
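
For reference, a usage sketch of the new handler on its own. It assumes the SizeLimitHandler defined in the hunk above is in scope, and the URL is just a placeholder:

from urllib.request import build_opener

# same 500KiB cap as the custom_handler registration above
opener = build_opener(SizeLimitHandler(500*1024))

resp = opener.open('http://example.com/')
body = resp.read()   # the handler already capped the buffered body
print(len(body))     # at most 512000 bytes
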
@@ -88,7 +128,10 @@ class GZIPHandler(BaseHandler):
         if 200 <= resp.code < 300:
             if resp.headers.get('Content-Encoding') == 'gzip':
                 data = resp.read()
-                data = GzipFile(fileobj=BytesIO(data), mode='r').read()
+
+                with patch_gzip_for_partial():
+                    data = GzipFile(fileobj=BytesIO(data), mode='r').read()
+
                 resp.headers['Content-Encoding'] = 'identity'

                 fp = BytesIO(data)
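
One caveat: GzipFile exposes _read_eof on Python 2 (and early Python 3 releases), but in modern Python 3 that method moved to the internal gzip._GzipReader, so the monkey-patch above would not take effect there. A self-contained sketch of the same "decompress what we have" idea that works on current Python 3 is to feed the truncated body to zlib directly (names and sizes here are illustrative):

import gzip
import zlib

payload = gzip.compress(b'hello morss ' * 20000)
truncated = payload[:1024]   # simulate a body cut off by SizeLimitHandler

# wbits=16+MAX_WBITS makes zlib parse the gzip header; unlike
# GzipFile.read(), decompressobj() returns whatever inflated cleanly
# and never insists on seeing the CRC/length trailer
partial = zlib.decompressobj(16 + zlib.MAX_WBITS).decompress(truncated)
print(len(partial), 'bytes recovered from a', len(truncated), 'byte download')
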