crawler: support 308 redirects
continuous-integration/drone/push Build is passing
Details
continuous-integration/drone/push Build is passing
Details
parent
db8e046eae
commit
f6fb456679
|
@ -33,16 +33,17 @@ try:
|
||||||
from urllib import quote
|
from urllib import quote
|
||||||
|
|
||||||
import mimetools
|
import mimetools
|
||||||
from urllib2 import (BaseHandler, HTTPCookieProcessor, Request, addinfourl,
|
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
||||||
build_opener, parse_http_list, parse_keqv_list)
|
Request, addinfourl, build_opener, parse_http_list,
|
||||||
|
parse_keqv_list)
|
||||||
from urlparse import urlparse, urlunparse
|
from urlparse import urlparse, urlunparse
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# python 3
|
# python 3
|
||||||
import email
|
import email
|
||||||
from urllib.parse import quote, urlparse, urlunparse
|
from urllib.parse import quote, urlparse, urlunparse
|
||||||
from urllib.request import (BaseHandler, HTTPCookieProcessor, Request,
|
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
||||||
addinfourl, build_opener, parse_http_list,
|
HTTPRedirectHandler, Request, addinfourl,
|
||||||
parse_keqv_list)
|
build_opener, parse_http_list, parse_keqv_list)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
|
@ -134,6 +135,7 @@ def custom_opener(follow=None, delay=None):
|
||||||
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
|
handlers.append(SizeLimitHandler(500*1024)) # 500KiB
|
||||||
handlers.append(HTTPCookieProcessor())
|
handlers.append(HTTPCookieProcessor())
|
||||||
handlers.append(GZIPHandler())
|
handlers.append(GZIPHandler())
|
||||||
|
handlers.append(HTTPAllRedirectHandler())
|
||||||
handlers.append(HTTPEquivHandler())
|
handlers.append(HTTPEquivHandler())
|
||||||
handlers.append(HTTPRefreshHandler())
|
handlers.append(HTTPRefreshHandler())
|
||||||
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
|
||||||
|
@ -400,6 +402,11 @@ class HTTPEquivHandler(RespStrHandler):
|
||||||
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
|
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPAllRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that additionally follows HTTP 308 responses.

    All other redirect codes are handled by the inherited
    ``HTTPRedirectHandler`` methods; this subclass only adds a handler
    for 308 (Permanent Redirect).
    """

    def http_error_308(self, req, fp, code, msg, headers):
        """Follow a 308 response by delegating to ``http_error_302``.

        The response is reported to the parent implementation as a 301
        rather than with the received ``code``.
        NOTE(review): presumably because ``redirect_request`` in older
        Python versions rejects 308 -- confirm against supported versions.

        Returns whatever ``http_error_302`` returns (``None`` when the
        response carries no redirect target header).
        """
        # ``self.http_error_302`` is already bound; passing ``self`` again
        # would shift every argument by one and raise TypeError.
        return self.http_error_302(req, fp, 301, msg, headers)
|
||||||
|
|
||||||
|
|
||||||
class HTTPRefreshHandler(BaseHandler):
|
class HTTPRefreshHandler(BaseHandler):
|
||||||
handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
|
handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue