From 840842d24611ef777e5306e3ce99a0e148d87dae Mon Sep 17 00:00:00 2001
From: pictuga
Date: Fri, 27 Oct 2017 23:12:40 +0200
Subject: [PATCH] crawler: limit download to 500KiB

More can only be linked to a fraudulent/incorrect use of the service
---
Review changes relative to the original submission:
 - SizeLimitHandler's default limit is written 5*1024**2; `^` is bitwise
   XOR in Python, so 5*1024^2 evaluated to 5122 bytes instead of 5MiB
 - patch_gzip_for_partial restores GzipFile._read_eof in a `finally`
   block, so a failed decompression no longer leaves GzipFile patched
 - docstring typo fixed ("it's" -> "its") and short why-comments added

 morss/crawler.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index d5416eb..ec2fd26 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -45,6 +45,7 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
     # & HTTPSHandler
 
     #handlers.append(DebugHandler())
+    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
     handlers.append(HTTPCookieProcessor())
     handlers.append(GZIPHandler())
     handlers.append(HTTPEquivHandler())
@@ -79,6 +80,55 @@ class DebugHandler(BaseHandler):
     https_response = http_response
 
 
+class SizeLimitHandler(BaseHandler):
+    """ Limit file size, defaults to 5MiB """
+
+    # lower than BaseHandler's default (500), so it is ordered ahead of default-priority handlers
+    handler_order = 450
+
+    def __init__(self, limit=5*1024**2):
+        # `**` is exponentiation; `^` is bitwise XOR and would yield 5122 bytes
+        self.limit = limit
+
+    def http_response(self, req, resp):
+        # read at most `limit` bytes; whatever comes after is never read
+        data = resp.read(self.limit)
+
+        # rewrap the (possibly truncated) body, keeping headers, url, code and msg
+        fp = BytesIO(data)
+        old_resp = resp
+        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+        resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+
+
+import contextlib
+
+@contextlib.contextmanager
+def patch_gzip_for_partial():
+    """
+    Context manager that replaces gzip.GzipFile._read_eof with a no-op.
+
+    This is useful when decompressing partial files, something that won't
+    work if GzipFile does its checksum comparison.
+
+    The original method is restored on exit, even if the body raises.
+    NOTE(review): _read_eof is a private GzipFile attribute -- confirm it
+    exists on every Python version this module supports.
+
+    from https://stackoverflow.com/a/18602286
+    """
+    _read_eof = GzipFile._read_eof
+    GzipFile._read_eof = lambda *args, **kwargs: None
+    try:
+        yield
+    finally:
+        GzipFile._read_eof = _read_eof
+
+
 class GZIPHandler(BaseHandler):
     def http_request(self, req):
         req.add_unredirected_header('Accept-Encoding', 'gzip')
@@ -88,7 +138,10 @@ class GZIPHandler(BaseHandler):
         if 200 <= resp.code < 300:
             if resp.headers.get('Content-Encoding') == 'gzip':
                 data = resp.read()
-                data = GzipFile(fileobj=BytesIO(data), mode='r').read()
+
+                with patch_gzip_for_partial():
+                    data = GzipFile(fileobj=BytesIO(data), mode='r').read()
+
                 resp.headers['Content-Encoding'] = 'identity'
 
                 fp = BytesIO(data)