From 7342ab26d2558efc9304168d04c94b55981f7558 Mon Sep 17 00:00:00 2001 From: pictuga Date: Thu, 25 Mar 2021 23:49:58 +0100 Subject: [PATCH] crawler: comment on how urllib works --- morss/crawler.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/morss/crawler.py b/morss/crawler.py index d25e8f6..465bf39 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -114,6 +114,15 @@ def custom_handler(follow=None, delay=None): # HTTPDefaultErrorHandler, HTTPRedirectHandler, # FTPHandler, FileHandler, HTTPErrorProcessor] # & HTTPSHandler + # + # when processing a request: + # (1) all the *_request are run + # (2) the *_open are run until something is returned (other than None) + # (3) all the *_response are run + # + # During (3), if an HTTP error occurs (i.e. not a 2XX response code), the + # http_error_* are run until something is returned (other than None). + # If they all return nothing, a Python exception is raised #handlers.append(DebugHandler()) handlers.append(SizeLimitHandler(500*1024)) # 500KiB