Only perform <meta> redirects on html pages

master
pictuga 2013-09-15 15:33:14 +02:00
parent 3176c2a8e8
commit c25aec7107
1 changed file with 12 additions and 11 deletions

View File

@@ -196,19 +196,20 @@ class HTMLDownloader(urllib2.HTTPCookieProcessor):
data = GzipFile(fileobj=StringIO(data), mode='r').read()
# <meta> redirect
match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
if match:
newurl = match.groups()[0]
log('redirect: %s' % newurl)
if resp.info().type in ['text/html', 'application/xhtml+xml']:
match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
if match:
newurl = match.groups()[0]
log('redirect: %s' % newurl)
newheaders = dict((k,v) for k,v in req.headers.items()
if k.lower() not in ('content-length', 'content-type'))
new = urllib2.Request(newurl,
headers=newheaders,
origin_req_host=req.get_origin_req_host(),
unverifiable=True)
newheaders = dict((k,v) for k,v in req.headers.items()
if k.lower() not in ('content-length', 'content-type'))
new = urllib2.Request(newurl,
headers=newheaders,
origin_req_host=req.get_origin_req_host(),
unverifiable=True)
return self.parent.open(new, timeout=req.timeout)
return self.parent.open(new, timeout=req.timeout)
# decode
data = decodeHTML(resp, data)