2014-11-19 10:57:40 +00:00
|
|
|
import ssl
|
|
|
|
import socket
|
|
|
|
|
|
|
|
from gzip import GzipFile
|
2015-03-10 17:05:02 +00:00
|
|
|
from io import BytesIO
|
2015-02-25 09:53:36 +00:00
|
|
|
|
|
|
|
try:
|
2015-04-06 14:54:59 +00:00
|
|
|
from urllib2 import BaseHandler, addinfourl, parse_keqv_list, parse_http_list
|
2015-02-25 10:07:09 +00:00
|
|
|
except ImportError:
|
2015-04-06 14:54:59 +00:00
|
|
|
from urllib.request import BaseHandler, addinfourl, parse_keqv_list, parse_http_list
|
2015-02-25 09:53:36 +00:00
|
|
|
|
2014-11-19 10:57:40 +00:00
|
|
|
import re
|
|
|
|
|
2015-02-25 16:50:23 +00:00
|
|
|
# Python 2/3 compatibility: Python 3 has no ``basestring`` builtin, so the
# bare name lookup below raises NameError there and ``str`` is used instead.
# On Python 2 the lookup succeeds and the builtin is left untouched.
try:
    basestring
except NameError:  # Python 3
    basestring = str
|
|
|
|
|
2014-11-19 10:57:40 +00:00
|
|
|
|
|
|
|
# Short aliases for families of MIME types.  Each alias maps to a list of
# concrete types, ordered as they should be offered.
MIMETYPE = {
    'xml': [
        'text/xml',
        'application/xml',
        'application/rss+xml',
        'application/rdf+xml',
        'application/atom+xml',
    ],
    'html': [
        'text/html',
        'application/xhtml+xml',
        'application/xml',
    ],
}
|
|
|
|
|
2015-02-25 09:53:36 +00:00
|
|
|
class GZIPHandler(BaseHandler):
    """Request gzip-compressed bodies and inflate them on the way back."""

    def http_request(self, req):
        """Add ``Accept-Encoding: gzip`` to *req* and return it."""
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

    def http_response(self, req, resp):
        """Return *resp*, with its body gunzipped when it was sent as gzip.

        Only responses with a 2xx status whose ``Content-Encoding`` header
        is exactly ``gzip`` are rewrapped; every other response is returned
        untouched (and unread).
        """
        if not 200 <= resp.code < 300:
            return resp
        if resp.headers.get('Content-Encoding') != 'gzip':
            return resp

        compressed = BytesIO(resp.read())
        inflated = GzipFile(fileobj=compressed, mode='r').read()

        # The original body has been consumed, so build a new response object
        # carrying the same headers, URL, status code and status message.
        unzipped = addinfourl(BytesIO(inflated), resp.headers, resp.url, resp.code)
        unzipped.msg = resp.msg
        return unzipped

    https_request = http_request
    https_response = http_response
|
|
|
|
|
|
|
|
|
|
|
|
def detect_encoding(data, con=None):
    """Guess the character encoding of a fetched document.

    Sources are tried in this order, and the first hit wins:

    1. the headers of *con*: a header entry literally named ``charset``
       (kept for backward compatibility), then the ``charset`` parameter of
       the ``Content-Type`` header;
    2. a ``charset=...`` declaration in the first 1000 bytes of *data*
       (e.g. an HTML ``<meta>`` tag);
    3. an ``encoding=...`` declaration in the first 100 bytes of *data*
       (e.g. an XML prolog);
    4. the fallback ``'utf-8'``.

    Args:
        data: the raw document as bytes; ``None`` is treated as empty.
        con: optional response-like object whose ``info()`` returns the
            response headers.

    Returns:
        The encoding name as a text string.  Names found in *data* are
        lower-cased; the name is not checked against the codec registry.
    """
    if con is not None:
        headers = con.info()
        charset = headers.get('charset')
        if not charset:
            # The charset normally lives inside Content-Type, which a plain
            # ``get('charset')`` cannot see.  Python 3 message objects expose
            # ``get_content_charset``; Python 2 ones expose ``getparam``.
            if hasattr(headers, 'get_content_charset'):
                charset = headers.get_content_charset()
            elif hasattr(headers, 'getparam'):
                charset = headers.getparam('charset')
        if charset:
            return charset

    data = data or b''

    # ``_`` is part of the character class so that names such as
    # ``Shift_JIS`` are captured whole instead of being cut at the underscore.
    match = re.search(b'charset=["\']?([0-9a-zA-Z_-]+)', data[:1000])
    if match:
        return match.group(1).lower().decode('ascii')

    match = re.search(b'encoding=["\']?([0-9a-zA-Z_-]+)', data[:100])
    if match:
        return match.group(1).lower().decode('ascii')

    return 'utf-8'
|
2014-11-19 10:57:40 +00:00
|
|
|
|
|
|
|
|
2015-02-25 09:53:36 +00:00
|
|
|
class EncodingFixHandler(BaseHandler):
    """Scrub byte sequences that are invalid in a text response's encoding."""

    def http_response(self, req, resp):
        """Return *resp* with its body round-tripped through its encoding.

        Applies only to 2xx responses whose ``Content-Type`` main type is
        ``text``.  The body is decoded leniently with the encoding reported
        by ``detect_encoding`` and encoded back, so undecodable bytes are
        replaced.  Other responses are returned untouched (and unread).
        """
        maintype = resp.info().get('Content-Type', '').split('/')[0]
        if not (200 <= resp.code < 300 and maintype == 'text'):
            return resp

        data = resp.read()
        enc = detect_encoding(data, resp)

        if enc:
            try:
                # Both directions use 'replace': the lenient decode can
                # produce U+FFFD, which many single-byte charsets cannot
                # encode, so a strict encode would raise UnicodeEncodeError.
                data = data.decode(enc, 'replace').encode(enc, 'replace')
            except (LookupError, UnicodeError):
                # The encoding name comes from the remote document and may
                # not be a usable text codec; keep the bytes as received
                # rather than failing the whole request.
                pass

        # The original body has been consumed, so build a new response object
        # carrying the same headers, URL, status code and status message.
        fixed = addinfourl(BytesIO(data), resp.headers, resp.url, resp.code)
        fixed.msg = resp.msg
        return fixed

    https_response = http_response
|
|
|
|
|
|
|
|
|
2015-02-25 09:53:36 +00:00
|
|
|
class UAHandler(BaseHandler):
    """Set the ``User-Agent`` header on outgoing requests when configured."""

    def __init__(self, useragent=None):
        # A falsy value (None, '') leaves requests untouched.
        self.useragent = useragent

    def http_request(self, req):
        """Add the configured ``User-Agent`` to *req*, if any, and return it."""
        agent = self.useragent
        if not agent:
            return req
        req.add_unredirected_header('User-Agent', agent)
        return req

    https_request = http_request
|
|
|
|
|
|
|
|
|
2015-02-25 09:53:36 +00:00
|
|
|
class AutoRefererHandler(BaseHandler):
    """Send the target host itself as ``Referer`` on outgoing requests."""

    def http_request(self, req):
        """Add ``Referer: http://<host>`` to *req* and return it.

        Requests to ``feeds.feedburner.com`` are passed through without a
        ``Referer`` header.
        """
        host = req.host
        if host == 'feeds.feedburner.com':
            return req
        req.add_unredirected_header('Referer', 'http://%s' % host)
        return req

    https_request = http_request
|
|
|
|
|
|
|
|
|
2015-02-25 09:53:36 +00:00
|
|
|
class ContentNegociationHandler(BaseHandler):  # FIXME
    """Build an ``Accept`` header from a ranked list of wanted MIME types.

    ``accept`` may be a single string or an iterable of groups.  Each group
    is either a string (a key of ``MIMETYPE``, expanded to its list, or a
    literal MIME type) or an iterable of MIME types.  The first group gets
    no q-value (i.e. 1.0) and each following group gets 0.1 less.  Unless
    ``strict`` is true, ``*/*`` is appended one step below the last group.
    """

    def __init__(self, accept=None, strict=False):
        self.accept = accept
        self.strict = strict

    def http_request(self, req):
        """Add the computed ``Accept`` header to *req* and return it.

        Nothing is added when ``accept`` is ``None``.
        """
        if self.accept is None:
            return req

        # Normalise into a local so that sending a request does not rewrite
        # the handler's configuration.
        groups = self.accept
        if isinstance(groups, basestring):
            groups = (groups,)

        # Weights are kept as integer tenths (10 == q=1.0).  Repeated float
        # subtraction drifts and, past nine groups, produces values such as
        # 1e-16 or negatives, which are not valid q-values.
        weights = {}
        tenths = 11
        for group in groups:
            tenths -= 1

            if isinstance(group, basestring):
                if group in MIMETYPE:
                    group = MIMETYPE[group]
                else:
                    weights[group] = tenths
                    continue

            for mime in group:
                # A type already listed keeps its earlier, higher rank.
                if mime not in weights:
                    weights[mime] = tenths

        if not self.strict:
            weights['*/*'] = tenths - 1

        parts = []
        for mime, weight in weights.items():
            if weight >= 10:
                parts.append(mime)
            else:
                # Floor at 0.1 so that long lists never emit q=0 ("not
                # acceptable") or a malformed value.
                parts.append('%s;q=0.%d' % (mime, max(weight, 1)))

        req.add_unredirected_header('Accept', ','.join(parts))
        return req

    https_request = http_request
|
|
|
|
|
|
|
|
|
2015-02-25 09:53:36 +00:00
|
|
|
class MetaRedirectHandler(BaseHandler):
    """Follow ``<meta http-equiv="refresh">`` redirects found in HTML bodies."""

    def http_response(self, req, resp):
        """Return the redirect target's response, or *resp* itself.

        Only 2xx responses whose content type starts with ``text/`` and is
        listed in ``MIMETYPE['html']`` are inspected.  When the body holds a
        meta refresh pointing at an ``http...`` URL, that URL is opened
        through the parent opener and its response is returned.  Otherwise
        the body, which has been consumed by the inspection, is rewrapped in
        a new response object.  Responses that are not inspected are
        returned untouched (and unread).
        """
        # Media types are case-insensitive and may carry stray whitespace
        # before the ';' that introduces parameters.
        contenttype = resp.info().get('Content-Type', '').split(';')[0].strip().lower()

        if not (200 <= resp.code < 300 and contenttype.startswith('text/')):
            return resp
        if contenttype not in MIMETYPE['html']:
            return resp

        data = resp.read()
        match = re.search(b'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)

        if match:
            # ``Request`` is not among this module's top-level imports, so it
            # is imported here, for both Python 2 and Python 3.
            try:
                from urllib2 import Request
            except ImportError:
                from urllib.request import Request

            new_url = match.group(1)
            if not isinstance(new_url, str):
                # The pattern runs on bytes, so on Python 3 the capture is
                # bytes while Request needs text.
                new_url = new_url.decode('utf-8', 'replace')

            # Headers describing the old request body are not forwarded.
            new_headers = dict((k, v) for k, v in req.headers.items()
                               if k.lower() not in ('content-length', 'content-type'))

            # ``origin_req_host`` is read as an attribute because the
            # ``get_origin_req_host()`` accessor was removed in Python 3.4.
            new = Request(new_url,
                          headers=new_headers,
                          origin_req_host=req.origin_req_host,
                          unverifiable=True)

            return self.parent.open(new, timeout=req.timeout)

        # No redirect: rebuild the response around the bytes already read.
        rebuilt = addinfourl(BytesIO(data), resp.headers, resp.url, resp.code)
        rebuilt.msg = resp.msg
        return rebuilt

    https_response = http_response
|
|
|
|
|
|
|
|
|
2015-02-25 09:53:36 +00:00
|
|
|
class EtagHandler(BaseHandler):
    """Conditional requests answered from a caller-supplied cached body.

    Args:
        cache: the cached response body (bytes, or text which is encoded as
            UTF-8 when served).  Conditional headers are only sent when this
            is non-empty.
        etag: value sent as ``If-None-Match``.
        lastmodified: value sent as ``If-Modified-Since``.
    """

    def __init__(self, cache="", etag=None, lastmodified=None):
        self.cache = cache
        self.etag = etag
        self.lastmodified = lastmodified

    def http_request(self, req):
        """Add the conditional headers to *req* and return it."""
        if self.cache:
            if self.etag:
                req.add_unredirected_header('If-None-Match', self.etag)
            if self.lastmodified:
                req.add_unredirected_header('If-Modified-Since', self.lastmodified)

        return req

    def http_error_304(self, req, fp, code, msg, headers):
        """Turn a ``304`` answer into a ``200`` response serving the cache.

        The validators this handler was configured with are written into
        *headers*, which is then attached to the returned response.
        """
        if self.etag:
            self._set_header(headers, 'etag', self.etag)
        if self.lastmodified:
            self._set_header(headers, 'last-modified', self.lastmodified)

        body = self.cache or b''
        if hasattr(body, 'encode') and not isinstance(body, bytes):
            # BytesIO only accepts bytes; a text cache (including the ""
            # default on Python 3) has to be encoded first.
            body = body.encode('utf-8')

        resp = addinfourl(BytesIO(body), headers, req.get_full_url(), 200)
        # addinfourl does not set a status message itself; provide one that
        # matches the 200 code for code that reads ``resp.msg``.
        resp.msg = 'OK'
        return resp

    @staticmethod
    def _set_header(headers, name, value):
        """Set header *name* to *value*, replacing any existing occurrence."""
        # Item assignment works on both Python 2 and Python 3 message
        # objects, unlike ``addheader`` which only exists on Python 2.
        # Deleting first avoids a duplicate header on Python 3, where
        # assignment appends.
        del headers[name]
        headers[name] = value

    https_request = http_request
|