crawler: support 308 redirects
continuous-integration/drone/push Build is passing Details

pictuga 2021-09-10 08:17:01 +02:00
parent db8e046eae
commit f6fb456679
1 changed file with 12 additions and 5 deletions

View File

@ -33,16 +33,17 @@ try:
from urllib import quote from urllib import quote
import mimetools import mimetools
from urllib2 import (BaseHandler, HTTPCookieProcessor, Request, addinfourl, from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
build_opener, parse_http_list, parse_keqv_list) Request, addinfourl, build_opener, parse_http_list,
parse_keqv_list)
from urlparse import urlparse, urlunparse from urlparse import urlparse, urlunparse
except ImportError: except ImportError:
# python 3 # python 3
import email import email
from urllib.parse import quote, urlparse, urlunparse from urllib.parse import quote, urlparse, urlunparse
from urllib.request import (BaseHandler, HTTPCookieProcessor, Request, from urllib.request import (BaseHandler, HTTPCookieProcessor,
addinfourl, build_opener, parse_http_list, HTTPRedirectHandler, Request, addinfourl,
parse_keqv_list) build_opener, parse_http_list, parse_keqv_list)
try: try:
# python 2 # python 2
@ -134,6 +135,7 @@ def custom_opener(follow=None, delay=None):
handlers.append(SizeLimitHandler(500*1024)) # 500KiB handlers.append(SizeLimitHandler(500*1024)) # 500KiB
handlers.append(HTTPCookieProcessor()) handlers.append(HTTPCookieProcessor())
handlers.append(GZIPHandler()) handlers.append(GZIPHandler())
handlers.append(HTTPAllRedirectHandler())
handlers.append(HTTPEquivHandler()) handlers.append(HTTPEquivHandler())
handlers.append(HTTPRefreshHandler()) handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(random.choice(DEFAULT_UAS))) handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
@ -400,6 +402,11 @@ class HTTPEquivHandler(RespStrHandler):
resp.headers[meta.get('http-equiv').lower()] = meta.get('content') resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
class HTTPAllRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that additionally follows HTTP 308 responses.

    A 308 response is delegated to the inherited ``http_error_302``
    implementation, reported to it as a 301.
    """

    def http_error_308(self, req, fp, code, msg, headers):
        """Follow a 308 redirect via the inherited 302 handling.

        Parameters mirror the ``http_error_<code>`` handler signature used by
        the other redirect methods. ``code`` is accepted for signature
        compatibility but is not forwarded.

        Returns whatever ``http_error_302`` returns for the redirected
        request.
        """
        # `self.http_error_302` is already bound: passing `self` again would
        # shift every argument by one and raise TypeError on each 308.
        # The status is forwarded as 301 rather than 308.
        # NOTE(review): presumably because older stdlib `redirect_request`
        # implementations reject 308 -- confirm against supported versions.
        return self.http_error_302(req, fp, 301, msg, headers)
class HTTPRefreshHandler(BaseHandler): class HTTPRefreshHandler(BaseHandler):
handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000 handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000