Only perform <meta> redirects on html pages

master
pictuga 2013-09-15 15:33:14 +02:00
parent 3176c2a8e8
commit c25aec7107
1 changed file with 12 additions and 11 deletions

View File

@@ -196,19 +196,20 @@ class HTMLDownloader(urllib2.HTTPCookieProcessor):
data = GzipFile(fileobj=StringIO(data), mode='r').read() data = GzipFile(fileobj=StringIO(data), mode='r').read()
# <meta> redirect # <meta> redirect
match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data) if resp.info().type in ['text/html', 'application/xhtml+xml']:
if match: match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
newurl = match.groups()[0] if match:
log('redirect: %s' % newurl) newurl = match.groups()[0]
log('redirect: %s' % newurl)
newheaders = dict((k,v) for k,v in req.headers.items() newheaders = dict((k,v) for k,v in req.headers.items()
if k.lower() not in ('content-length', 'content-type')) if k.lower() not in ('content-length', 'content-type'))
new = urllib2.Request(newurl, new = urllib2.Request(newurl,
headers=newheaders, headers=newheaders,
origin_req_host=req.get_origin_req_host(), origin_req_host=req.get_origin_req_host(),
unverifiable=True) unverifiable=True)
return self.parent.open(new, timeout=req.timeout) return self.parent.open(new, timeout=req.timeout)
# decode # decode
data = decodeHTML(resp, data) data = decodeHTML(resp, data)