crawler: limit download to 500KiB
Anything larger can only be the result of fraudulent/incorrect use of the service
parent
fbe811384a
commit
840842d246
|
@ -45,6 +45,7 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
|
|||
# & HTTPSHandler
|
||||
|
||||
#handlers.append(DebugHandler())
|
||||
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
|
||||
handlers.append(HTTPCookieProcessor())
|
||||
handlers.append(GZIPHandler())
|
||||
handlers.append(HTTPEquivHandler())
|
||||
|
@ -79,6 +80,45 @@ class DebugHandler(BaseHandler):
|
|||
https_response = http_response
|
||||
|
||||
|
||||
class SizeLimitHandler(BaseHandler):
    """ Limit file size, defaults to 5MiB

    Reads at most `limit` bytes from every HTTP(S) response and passes the
    (possibly truncated) payload on, wrapped in a fresh response object that
    keeps the original headers, url, status code and msg.
    """

    # BaseHandler's default handler_order is 500; lower values sort earlier
    handler_order = 450

    def __init__(self, limit=5*1024**2):
        # `**` is exponentiation. The previous `5*1024^2` used `^` (bitwise
        # XOR) and evaluated to 5122 bytes instead of 5 MiB.
        self.limit = limit

    def http_response(self, req, resp):
        """ Return a copy of `resp` whose body is capped at `self.limit` bytes """
        # read() with a size argument stops after that many bytes, so the
        # rest of an oversized body is never pulled into memory
        data = resp.read(self.limit)

        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response
|
||||
|
||||
|
||||
import contextlib
|
||||
|
||||
@contextlib.contextmanager
def patch_gzip_for_partial():
    """
    Context manager that replaces gzip.GzipFile._read_eof with a no-op.

    This is useful when decompressing partial files, something that won't
    work if GzipFile does its checksum comparison.

    The original state of GzipFile is always restored on exit, even when the
    body of the `with` block raises.

    NOTE(review): on Python 3.5+ GzipFile itself no longer seems to define
    _read_eof (the trailer check lives in an internal reader class), so this
    patch is probably only effective on older interpreters -- confirm.

    from https://stackoverflow.com/a/18602286
    """
    # Look in the class __dict__ rather than using getattr so that a missing
    # attribute is detected instead of raising AttributeError.
    missing = object()
    original = GzipFile.__dict__.get('_read_eof', missing)

    GzipFile._read_eof = lambda *args, **kwargs: None

    try:
        yield

    finally:
        # Undo the monkey-patch no matter how the block was left, otherwise
        # every later GzipFile user would silently skip the checksum check.
        if original is missing:
            del GzipFile._read_eof

        else:
            GzipFile._read_eof = original
|
||||
|
||||
|
||||
class GZIPHandler(BaseHandler):
|
||||
def http_request(self, req):
|
||||
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
||||
|
@ -88,7 +128,10 @@ class GZIPHandler(BaseHandler):
|
|||
if 200 <= resp.code < 300:
|
||||
if resp.headers.get('Content-Encoding') == 'gzip':
|
||||
data = resp.read()
|
||||
data = GzipFile(fileobj=BytesIO(data), mode='r').read()
|
||||
|
||||
with patch_gzip_for_partial():
|
||||
data = GzipFile(fileobj=BytesIO(data), mode='r').read()
|
||||
|
||||
resp.headers['Content-Encoding'] = 'identity'
|
||||
|
||||
fp = BytesIO(data)
|
||||
|
|
Loading…
Reference in New Issue