Replace MetaRedirect handler with two cleaner ones

One handles <meta http-equiv> tags and the other handles the HTTP 'refresh' header.
master
pictuga 2015-04-06 23:03:17 +08:00
parent f2fe4fc364
commit 1b4fc88ad0
2 changed files with 39 additions and 18 deletions

View File

@ -137,28 +137,49 @@ class ContentNegociationHandler(BaseHandler): #FIXME
https_request = http_request
class MetaRedirectHandler(BaseHandler):
class HTTPEquivHandler(BaseHandler):
" Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
handler_order = 600
def http_response(self, req, resp):
contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and contenttype.startswith('text/'):
if contenttype in MIMETYPE['html']:
data = resp.read()
match = re.search(b'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
if match:
new_url = match.groups()[0]
new_headers = dict((k, v) for k, v in list(req.headers.items())
if k.lower() not in ('content-length', 'content-type'))
new = Request(new_url,
headers=new_headers,
origin_req_host=req.get_origin_req_host(),
unverifiable=True)
return self.parent.open(new, timeout=req.timeout)
else:
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
regex = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>'
headers = [x.groupdict() for x in re.finditer(regex, data[:1000].decode('utf-8', 'replace'))]
for header in headers:
resp.headers[header['key'].lower()] = header['value']
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class HTTPRefreshHandler(BaseHandler):
    """Rewrite a successful response carrying a 'refresh' header as a 302.

    When a 2xx response has a 'refresh' header of the form
    ``<delay>; url=<target>`` (target optionally quoted), the response is
    mutated in place: its code becomes 302, its message becomes
    'Moved Temporarily' and a 'location' header is set to the target.
    The delay value is parsed but not otherwise used.

    NOTE(review): presumably the handlers that run after this one (see
    ``handler_order``) then treat the response as a regular redirect --
    confirm against the opener configuration.
    """

    handler_order = 700  # HTTPErrorProcessor has a handler_order of 1000

    def http_response(self, req, resp):
        """Return ``resp``, turned into a 302 if it carries a usable refresh.

        ``req`` is unused. ``resp`` is returned unchanged when its code is
        outside 200-299, when it has no 'refresh' header, or when that
        header does not match the expected ``<delay>; url=<target>`` form.
        """
        if not 200 <= resp.code < 300:
            return resp

        refresh = resp.headers.get('refresh')
        if not refresh:
            return resp

        # Group 2 captures an optional opening quote; the \2 backreference
        # requires the same character (or nothing) right before the end.
        regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
        match = re.search(regex, refresh)

        if match:
            url = match.group('url')

            if url:
                resp.code = 302
                resp.msg = 'Moved Temporarily'
                resp.headers['location'] = url

        return resp

    # urllib dispatches response processing per protocol, so without this
    # alias the handler would never be invoked for HTTPS responses.
    https_response = http_response

View File

@ -233,8 +233,8 @@ class Cache:
default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
crawler.EncodingFixHandler()]
crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(),
crawler.HTTPRefreshHandler(), crawler.EncodingFixHandler()]
def accept_handler(*kargs):
handlers = default_handlers[:]