crawler: limit download to 500KiB
More can only be linked to a fraudulent/incorrect use of the service

branch master
parent fbe811384a
commit 840842d246

@@ -45,6 +45,7 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
     # & HTTPSHandler
 
     #handlers.append(DebugHandler())
+    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
     handlers.append(HTTPCookieProcessor())
     handlers.append(GZIPHandler())
     handlers.append(HTTPEquivHandler())
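
A minimal sketch (not the project's code) of how a handler chain like the one above becomes an opener with the stock urllib API, and of what the new 500 KiB cap means for callers. SizeLimitHandler is the class added in the second hunk below; the crawler module name (taken from the commit title) and the example URL are assumptions for illustration.

# Hedged sketch: wire the handlers into an opener and fetch a page.
from urllib.request import HTTPCookieProcessor, build_opener

from crawler import SizeLimitHandler  # assumed module name, see hunk below

handlers = [
    SizeLimitHandler(500*1024),   # truncate every response body at 500 KiB
    HTTPCookieProcessor(),
]

opener = build_opener(*handlers)
body = opener.open('https://example.com/feed.xml').read()
assert len(body) <= 500*1024      # downstream code never sees more than that
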
@@ -79,6 +80,45 @@ class DebugHandler(BaseHandler):
     https_response = http_response
 
 
+class SizeLimitHandler(BaseHandler):
+    """ Limit file size, defaults to 5MiB """
+
+    handler_order = 450
+
+    def __init__(self, limit=5*1024**2):
+        self.limit = limit
+
+    def http_response(self, req, resp):
+        data = resp.read(self.limit)
+
+        fp = BytesIO(data)
+        old_resp = resp
+        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+        resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+
+
+import contextlib
+
+@contextlib.contextmanager
+def patch_gzip_for_partial():
+    """
+    Context manager that replaces gzip.GzipFile._read_eof with a no-op.
+
+    This is useful when decompressing partial files, something that won't
+    work if GzipFile does its checksum comparison.
+
+    from https://stackoverflow.com/a/18602286
+    """
+    _read_eof = GzipFile._read_eof
+    GzipFile._read_eof = lambda *args, **kwargs: None
+    yield
+    GzipFile._read_eof = _read_eof
+
+
 class GZIPHandler(BaseHandler):
     def http_request(self, req):
         req.add_unredirected_header('Accept-Encoding', 'gzip')
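
The second half of this hunk is easier to follow with a concrete run. The gzip reader validates the 8-byte CRC32/ISIZE trailer inside _read_eof(), so a body whose trailer is missing or wrong fails even though its deflate stream decoded completely; replacing _read_eof with a no-op lets the decoded bytes through. The following self-contained Python 3 sketch is not part of the commit: the payload is made up, and on Python 3.5+ the method lives on the private gzip._GzipReader class rather than on GzipFile, so the sketch picks the patch target dynamically.

# Hedged, self-contained demo of skipping the gzip trailer check.
import contextlib
import gzip
from gzip import GzipFile
from io import BytesIO

@contextlib.contextmanager
def patch_gzip_for_partial():
    # same idea as the context manager added above, with the patch target
    # adjusted for newer Python versions (gzip._GzipReader is private API)
    target = GzipFile if hasattr(GzipFile, '_read_eof') else gzip._GzipReader
    _read_eof = target._read_eof
    target._read_eof = lambda *args, **kwargs: None
    try:
        yield
    finally:
        target._read_eof = _read_eof

payload = gzip.compress(b'some response body ' * 200)
damaged = payload[:-8]  # drop the CRC32 + ISIZE trailer

try:
    GzipFile(fileobj=BytesIO(damaged), mode='r').read()
except EOFError as exc:
    print('without patch:', exc)   # trailer check fails with EOFError

with patch_gzip_for_partial():
    data = GzipFile(fileobj=BytesIO(damaged), mode='r').read()

print('with patch:', len(data), 'bytes recovered')
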
@@ -88,7 +128,10 @@ class GZIPHandler(BaseHandler):
         if 200 <= resp.code < 300:
             if resp.headers.get('Content-Encoding') == 'gzip':
                 data = resp.read()
-                data = GzipFile(fileobj=BytesIO(data), mode='r').read()
+
+                with patch_gzip_for_partial():
+                    data = GzipFile(fileobj=BytesIO(data), mode='r').read()
 
                 resp.headers['Content-Encoding'] = 'identity'
 
                 fp = BytesIO(data)
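
Both http_response methods in this file rely on the same re-wrapping trick: read (some of) the body, put the bytes in a BytesIO, and rebuild a response-like object with urllib.response.addinfourl so downstream handlers and callers are none the wiser. A standalone sketch of that pattern, with made-up headers, URL and body:

# Hedged sketch of the addinfourl re-wrapping pattern used by both handlers.
from email.message import Message
from io import BytesIO
from urllib.response import addinfourl

body = b'<html>truncated or re-encoded body</html>'

headers = Message()
headers['Content-Type'] = 'text/html'
headers['Content-Encoding'] = 'identity'

resp = addinfourl(BytesIO(body), headers, 'https://example.com/page', 200)

print(resp.code, resp.url)           # 200 https://example.com/page
print(resp.headers['Content-Type'])  # text/html
print(resp.read())                   # the wrapped bytes, like any response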