crawler: limit download to 500KiB

Anything larger can only be linked to fraudulent or incorrect use of the service
master
pictuga 2017-10-27 23:12:40 +02:00
parent fbe811384a
commit 840842d246
1 changed file with 44 additions and 1 deletion


@@ -45,6 +45,7 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
         # & HTTPSHandler
 
     #handlers.append(DebugHandler())
+    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
     handlers.append(HTTPCookieProcessor())
     handlers.append(GZIPHandler())
     handlers.append(HTTPEquivHandler())
@@ -79,6 +80,45 @@ class DebugHandler(BaseHandler):
     https_response = http_response
 
 
+class SizeLimitHandler(BaseHandler):
+    """ Limit file size, defaults to 5MiB """
+
+    handler_order = 450
+
+    def __init__(self, limit=5*1024*1024):
+        self.limit = limit
+
+    def http_response(self, req, resp):
+        data = resp.read(self.limit)
+
+        fp = BytesIO(data)
+        old_resp = resp
+        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+        resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+
+
+import contextlib
+
+@contextlib.contextmanager
+def patch_gzip_for_partial():
+    """
+    Context manager that replaces gzip.GzipFile._read_eof with a no-op.
+
+    This is useful when decompressing partial files, something that won't
+    work if GzipFile does its checksum comparison.
+
+    from https://stackoverflow.com/a/18602286
+    """
+    _read_eof = GzipFile._read_eof
+    GzipFile._read_eof = lambda *args, **kwargs: None
+    yield
+    GzipFile._read_eof = _read_eof
+
+
 class GZIPHandler(BaseHandler):
     def http_request(self, req):
         req.add_unredirected_header('Accept-Encoding', 'gzip')
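
For context, a minimal sketch of how the new handler is meant to be used (not part of the commit; it assumes the SizeLimitHandler defined above is in scope, and that custom_handler() ultimately hands its handlers list to build_opener, which this diff does not show):

try:
    from urllib2 import build_opener           # Python 2
except ImportError:
    from urllib.request import build_opener    # Python 3

# Cap every HTTP(S) response body at 500 KiB, the same figure used in custom_handler()
opener = build_opener(SizeLimitHandler(500*1024))

resp = opener.open('http://example.com/')
body = resp.read()   # at most 500 KiB, even if the server sends more
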
@@ -88,7 +128,10 @@ class GZIPHandler(BaseHandler):
         if 200 <= resp.code < 300:
             if resp.headers.get('Content-Encoding') == 'gzip':
                 data = resp.read()
-                data = GzipFile(fileobj=BytesIO(data), mode='r').read()
+
+                with patch_gzip_for_partial():
+                    data = GzipFile(fileobj=BytesIO(data), mode='r').read()
+
                 resp.headers['Content-Encoding'] = 'identity'
 
                 fp = BytesIO(data)
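
Because SizeLimitHandler registers with handler_order = 450, below the default of 500, it should process the response before GZIPHandler does, so GZIPHandler may be handed a gzip stream that was cut off at 500 KiB; patch_gzip_for_partial() is what lets it still decompress whatever did arrive. Below is a minimal sketch of that failure mode and the workaround, not part of the commit: it assumes patch_gzip_for_partial() from this commit is in scope, and it targets Python 2's gzip module, where GzipFile still exposes a _read_eof method (later Python 3 releases moved that method to an internal reader class, so this monkey-patch no longer applies there).

import os
from io import BytesIO
from gzip import GzipFile

# Build a valid gzip stream, then cut it off to mimic a download truncated
# by SizeLimitHandler.
buf = BytesIO()
with GzipFile(fileobj=buf, mode='wb') as f:
    f.write(os.urandom(100000))
truncated = buf.getvalue()[:1024]

# Without the patch, GzipFile errors out when it tries to verify the CRC/size
# trailer that was cut off; with it, the bytes decompressed so far are returned.
with patch_gzip_for_partial():
    data = GzipFile(fileobj=BytesIO(truncated), mode='r').read()

print(len(data))   # whatever could be recovered before the cut-off
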