Replace MetaRedirect handler with two cleaner ones
One for <meta http-equiv> and one for HTTP 'refresh' header
master
parent
f2fe4fc364
commit
1b4fc88ad0
|
@ -137,28 +137,49 @@ class ContentNegociationHandler(BaseHandler): #FIXME
|
||||||
https_request = http_request
|
https_request = http_request
|
||||||
|
|
||||||
|
|
||||||
class MetaRedirectHandler(BaseHandler):
|
class HTTPEquivHandler(BaseHandler):
|
||||||
|
" Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
|
||||||
|
|
||||||
|
handler_order = 600
|
||||||
|
|
||||||
def http_response(self, req, resp):
|
def http_response(self, req, resp):
|
||||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||||
if 200 <= resp.code < 300 and contenttype.startswith('text/'):
|
if 200 <= resp.code < 300 and contenttype.startswith('text/'):
|
||||||
if contenttype in MIMETYPE['html']:
|
if contenttype in MIMETYPE['html']:
|
||||||
data = resp.read()
|
data = resp.read()
|
||||||
match = re.search(b'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
|
|
||||||
if match:
|
|
||||||
new_url = match.groups()[0]
|
|
||||||
new_headers = dict((k, v) for k, v in list(req.headers.items())
|
|
||||||
if k.lower() not in ('content-length', 'content-type'))
|
|
||||||
new = Request(new_url,
|
|
||||||
headers=new_headers,
|
|
||||||
origin_req_host=req.get_origin_req_host(),
|
|
||||||
unverifiable=True)
|
|
||||||
|
|
||||||
return self.parent.open(new, timeout=req.timeout)
|
regex = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>'
|
||||||
else:
|
headers = [x.groupdict() for x in re.finditer(regex, data[:1000].decode('utf-8', 'replace'))]
|
||||||
fp = BytesIO(data)
|
|
||||||
old_resp = resp
|
for header in headers:
|
||||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
resp.headers[header['key'].lower()] = header['value']
|
||||||
resp.msg = old_resp.msg
|
|
||||||
|
fp = BytesIO(data)
|
||||||
|
old_resp = resp
|
||||||
|
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||||
|
resp.msg = old_resp.msg
|
||||||
|
|
||||||
|
return resp
|
||||||
|
|
||||||
|
https_response = http_response
|
||||||
|
|
||||||
|
|
||||||
|
class HTTPRefreshHandler(BaseHandler):
|
||||||
|
handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000
|
||||||
|
|
||||||
|
def http_response(self, req, resp):
|
||||||
|
if 200 <= resp.code < 300:
|
||||||
|
if resp.headers.get('refresh'):
|
||||||
|
regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
|
||||||
|
match = re.search(regex, resp.headers.get('refresh'))
|
||||||
|
|
||||||
|
if match:
|
||||||
|
url = match.groupdict()['url']
|
||||||
|
|
||||||
|
if url:
|
||||||
|
resp.code = 302
|
||||||
|
resp.msg = 'Moved Temporarily'
|
||||||
|
resp.headers['location'] = url
|
||||||
|
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
|
|
|
@ -233,8 +233,8 @@ class Cache:
|
||||||
|
|
||||||
|
|
||||||
default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
|
default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
|
||||||
crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
|
crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(),
|
||||||
crawler.EncodingFixHandler()]
|
crawler.HTTPRefreshHandler(), crawler.EncodingFixHandler()]
|
||||||
|
|
||||||
def accept_handler(*kargs):
|
def accept_handler(*kargs):
|
||||||
handlers = default_handlers[:]
|
handlers = default_handlers[:]
|
||||||
|
|
Loading…
Reference in New Issue