crawler: support 308 redirects
This commit is contained in:
		@@ -33,16 +33,17 @@ try:
 | 
				
			|||||||
    from urllib import quote
 | 
					    from urllib import quote
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    import mimetools
 | 
					    import mimetools
 | 
				
			||||||
    from urllib2 import (BaseHandler, HTTPCookieProcessor, Request, addinfourl,
 | 
					    from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
 | 
				
			||||||
                         build_opener, parse_http_list, parse_keqv_list)
 | 
					                         Request, addinfourl, build_opener, parse_http_list,
 | 
				
			||||||
 | 
					                         parse_keqv_list)
 | 
				
			||||||
    from urlparse import urlparse, urlunparse
 | 
					    from urlparse import urlparse, urlunparse
 | 
				
			||||||
except ImportError:
 | 
					except ImportError:
 | 
				
			||||||
    # python 3
 | 
					    # python 3
 | 
				
			||||||
    import email
 | 
					    import email
 | 
				
			||||||
    from urllib.parse import quote, urlparse, urlunparse
 | 
					    from urllib.parse import quote, urlparse, urlunparse
 | 
				
			||||||
    from urllib.request import (BaseHandler, HTTPCookieProcessor, Request,
 | 
					    from urllib.request import (BaseHandler, HTTPCookieProcessor,
 | 
				
			||||||
                                addinfourl, build_opener, parse_http_list,
 | 
					                                HTTPRedirectHandler, Request, addinfourl,
 | 
				
			||||||
                                parse_keqv_list)
 | 
					                                build_opener, parse_http_list, parse_keqv_list)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
try:
 | 
					try:
 | 
				
			||||||
    # python 2
 | 
					    # python 2
 | 
				
			||||||
@@ -134,6 +135,7 @@ def custom_opener(follow=None, delay=None):
 | 
				
			|||||||
    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
 | 
					    handlers.append(SizeLimitHandler(500*1024)) # 500KiB
 | 
				
			||||||
    handlers.append(HTTPCookieProcessor())
 | 
					    handlers.append(HTTPCookieProcessor())
 | 
				
			||||||
    handlers.append(GZIPHandler())
 | 
					    handlers.append(GZIPHandler())
 | 
				
			||||||
 | 
					    handlers.append(HTTPAllRedirectHandler())
 | 
				
			||||||
    handlers.append(HTTPEquivHandler())
 | 
					    handlers.append(HTTPEquivHandler())
 | 
				
			||||||
    handlers.append(HTTPRefreshHandler())
 | 
					    handlers.append(HTTPRefreshHandler())
 | 
				
			||||||
    handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
 | 
					    handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
 | 
				
			||||||
@@ -400,6 +402,11 @@ class HTTPEquivHandler(RespStrHandler):
 | 
				
			|||||||
                    resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
 | 
					                    resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class HTTPAllRedirectHandler(HTTPRedirectHandler):
 | 
				
			||||||
 | 
					    def http_error_308(self, req, fp, code, msg, headers):
 | 
				
			||||||
 | 
					        return self.http_error_301(req, fp, 301, msg, headers)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class HTTPRefreshHandler(BaseHandler):
 | 
					class HTTPRefreshHandler(BaseHandler):
 | 
				
			||||||
    handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
 | 
					    handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user