2to3: crawler.py port urllib2 and StringIO
parent
27cf8f6498
commit
4f224888d8
|
@ -1,10 +1,18 @@
|
||||||
import urllib2
|
|
||||||
import httplib
|
import httplib
|
||||||
import ssl
|
import ssl
|
||||||
import socket
|
import socket
|
||||||
|
|
||||||
from gzip import GzipFile
|
from gzip import GzipFile
|
||||||
from StringIO import StringIO
|
|
||||||
|
try:
|
||||||
|
from StringIO import StringIO
|
||||||
|
from urllib2 import URLError
|
||||||
|
from urllib2 import HTTPSHandler, BaseHandler, AbstractHTTPHandler, Request, addinfourl
|
||||||
|
except:
|
||||||
|
from io import StringIO
|
||||||
|
from urllib.error import URLError
|
||||||
|
from urllib.request import HTTPSHandler, BaseHandler, AbstractHTTPHandler, Request, addinfourl
|
||||||
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
@ -16,7 +24,7 @@ MIMETYPE = {
|
||||||
|
|
||||||
# SSL-related code proudly copy-pasted from https://stackoverflow.com/questions/1087227/validate-ssl-certificates-with-python
|
# SSL-related code proudly copy-pasted from https://stackoverflow.com/questions/1087227/validate-ssl-certificates-with-python
|
||||||
|
|
||||||
class InvalidCertificateException(httplib.HTTPException, urllib2.URLError):
|
class InvalidCertificateException(httplib.HTTPException, URLError):
|
||||||
def __init__(self, host, cert, reason):
|
def __init__(self, host, cert, reason):
|
||||||
httplib.HTTPException.__init__(self)
|
httplib.HTTPException.__init__(self)
|
||||||
self.host = host
|
self.host = host
|
||||||
|
@ -72,9 +80,9 @@ class CertValidatingHTTPSConnection(httplib.HTTPConnection):
|
||||||
'hostname mismatch')
|
'hostname mismatch')
|
||||||
|
|
||||||
|
|
||||||
class VerifiedHTTPSHandler(urllib2.HTTPSHandler):
|
class VerifiedHTTPSHandler(HTTPSHandler):
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
urllib2.AbstractHTTPHandler.__init__(self)
|
AbstractHTTPHandler.__init__(self)
|
||||||
self._connection_args = kwargs
|
self._connection_args = kwargs
|
||||||
|
|
||||||
def https_open(self, req):
|
def https_open(self, req):
|
||||||
|
@ -85,18 +93,18 @@ class VerifiedHTTPSHandler(urllib2.HTTPSHandler):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return self.do_open(http_class_wrapper, req)
|
return self.do_open(http_class_wrapper, req)
|
||||||
except urllib2.URLError, e:
|
except URLError, e:
|
||||||
if type(e.reason) == ssl.SSLError and e.reason.args[0] == 1:
|
if type(e.reason) == ssl.SSLError and e.reason.args[0] == 1:
|
||||||
raise InvalidCertificateException(req.host, '',
|
raise InvalidCertificateException(req.host, '',
|
||||||
e.reason.args[1])
|
e.reason.args[1])
|
||||||
raise
|
raise
|
||||||
|
|
||||||
https_request = urllib2.HTTPSHandler.do_request_
|
https_request = HTTPSHandler.do_request_
|
||||||
|
|
||||||
# end of copy-paste code
|
# end of copy-paste code
|
||||||
|
|
||||||
|
|
||||||
class GZIPHandler(urllib2.BaseHandler):
|
class GZIPHandler(BaseHandler):
|
||||||
def http_request(self, req):
|
def http_request(self, req):
|
||||||
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
||||||
return req
|
return req
|
||||||
|
@ -109,7 +117,7 @@ class GZIPHandler(urllib2.BaseHandler):
|
||||||
|
|
||||||
fp = StringIO(data)
|
fp = StringIO(data)
|
||||||
old_resp = resp
|
old_resp = resp
|
||||||
resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||||
resp.msg = old_resp.msg
|
resp.msg = old_resp.msg
|
||||||
|
|
||||||
return resp
|
return resp
|
||||||
|
@ -133,7 +141,7 @@ def detect_encoding(data, con=None):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
class EncodingFixHandler(urllib2.BaseHandler):
|
class EncodingFixHandler(BaseHandler):
|
||||||
def http_response(self, req, resp):
|
def http_response(self, req, resp):
|
||||||
if 200 <= resp.code < 300 and resp.info().maintype == 'text':
|
if 200 <= resp.code < 300 and resp.info().maintype == 'text':
|
||||||
data = resp.read()
|
data = resp.read()
|
||||||
|
@ -145,7 +153,7 @@ class EncodingFixHandler(urllib2.BaseHandler):
|
||||||
|
|
||||||
fp = StringIO(data)
|
fp = StringIO(data)
|
||||||
old_resp = resp
|
old_resp = resp
|
||||||
resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||||
resp.msg = old_resp.msg
|
resp.msg = old_resp.msg
|
||||||
|
|
||||||
return resp
|
return resp
|
||||||
|
@ -153,7 +161,7 @@ class EncodingFixHandler(urllib2.BaseHandler):
|
||||||
https_response = http_response
|
https_response = http_response
|
||||||
|
|
||||||
|
|
||||||
class UAHandler(urllib2.BaseHandler):
|
class UAHandler(BaseHandler):
|
||||||
def __init__(self, useragent=None):
|
def __init__(self, useragent=None):
|
||||||
self.useragent = useragent
|
self.useragent = useragent
|
||||||
|
|
||||||
|
@ -165,7 +173,7 @@ class UAHandler(urllib2.BaseHandler):
|
||||||
https_request = http_request
|
https_request = http_request
|
||||||
|
|
||||||
|
|
||||||
class AutoRefererHandler(urllib2.BaseHandler):
|
class AutoRefererHandler(BaseHandler):
|
||||||
def http_request(self, req):
|
def http_request(self, req):
|
||||||
if req.get_host() != 'feeds.feedburner.com':
|
if req.get_host() != 'feeds.feedburner.com':
|
||||||
req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
|
req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
|
||||||
|
@ -174,7 +182,7 @@ class AutoRefererHandler(urllib2.BaseHandler):
|
||||||
https_request = http_request
|
https_request = http_request
|
||||||
|
|
||||||
|
|
||||||
class ContentNegociationHandler(urllib2.BaseHandler): #FIXME
|
class ContentNegociationHandler(BaseHandler): #FIXME
|
||||||
def __init__(self, accept=None, strict=False):
|
def __init__(self, accept=None, strict=False):
|
||||||
self.accept = accept
|
self.accept = accept
|
||||||
self.strict = strict
|
self.strict = strict
|
||||||
|
@ -211,7 +219,7 @@ class ContentNegociationHandler(urllib2.BaseHandler): #FIXME
|
||||||
https_request = http_request
|
https_request = http_request
|
||||||
|
|
||||||
|
|
||||||
class MetaRedirectHandler(urllib2.BaseHandler):
|
class MetaRedirectHandler(BaseHandler):
|
||||||
def http_response(self, req, resp):
|
def http_response(self, req, resp):
|
||||||
if 200 <= resp.code < 300 and resp.info().maintype == 'text':
|
if 200 <= resp.code < 300 and resp.info().maintype == 'text':
|
||||||
if resp.info().type in MIMETYPE['html']:
|
if resp.info().type in MIMETYPE['html']:
|
||||||
|
@ -221,7 +229,7 @@ class MetaRedirectHandler(urllib2.BaseHandler):
|
||||||
new_url = match.groups()[0]
|
new_url = match.groups()[0]
|
||||||
new_headers = dict((k, v) for k, v in list(req.headers.items())
|
new_headers = dict((k, v) for k, v in list(req.headers.items())
|
||||||
if k.lower() not in ('content-length', 'content-type'))
|
if k.lower() not in ('content-length', 'content-type'))
|
||||||
new = urllib2.Request(new_url,
|
new = Request(new_url,
|
||||||
headers=new_headers,
|
headers=new_headers,
|
||||||
origin_req_host=req.get_origin_req_host(),
|
origin_req_host=req.get_origin_req_host(),
|
||||||
unverifiable=True)
|
unverifiable=True)
|
||||||
|
@ -230,7 +238,7 @@ class MetaRedirectHandler(urllib2.BaseHandler):
|
||||||
else:
|
else:
|
||||||
fp = StringIO(data)
|
fp = StringIO(data)
|
||||||
old_resp = resp
|
old_resp = resp
|
||||||
resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||||
resp.msg = old_resp.msg
|
resp.msg = old_resp.msg
|
||||||
|
|
||||||
return resp
|
return resp
|
||||||
|
@ -238,7 +246,7 @@ class MetaRedirectHandler(urllib2.BaseHandler):
|
||||||
https_response = http_response
|
https_response = http_response
|
||||||
|
|
||||||
|
|
||||||
class EtagHandler(urllib2.BaseHandler):
|
class EtagHandler(BaseHandler):
|
||||||
def __init__(self, cache="", etag=None, lastmodified=None):
|
def __init__(self, cache="", etag=None, lastmodified=None):
|
||||||
self.cache = cache
|
self.cache = cache
|
||||||
self.etag = etag
|
self.etag = etag
|
||||||
|
@ -258,7 +266,7 @@ class EtagHandler(urllib2.BaseHandler):
|
||||||
headers.addheader('etag', self.etag)
|
headers.addheader('etag', self.etag)
|
||||||
if self.lastmodified:
|
if self.lastmodified:
|
||||||
headers.addheader('last-modified', self.lastmodified)
|
headers.addheader('last-modified', self.lastmodified)
|
||||||
resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
|
resp = addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
https_request = http_request
|
https_request = http_request
|
||||||
|
|
Loading…
Reference in New Issue