crawler: catch html parsing errors

master
pictuga 2020-07-06 12:25:38 +02:00
parent 15951d228c
commit 64af86c11e
1 changed file with 21 additions and 10 deletions

View File

@@ -311,14 +311,20 @@ class AlternateHandler(BaseHandler):
# opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
data = resp.read() data = resp.read()
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links: try:
if link.get('type', '') in self.follow: links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
resp.code = 302
resp.msg = 'Moved Temporarily' for link in links:
resp.headers['location'] = link.get('href') if link.get('type', '') in self.follow:
break resp.code = 302
resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href')
break
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data) fp = BytesIO(data)
old_resp = resp old_resp = resp
@@ -340,10 +346,15 @@ class HTTPEquivHandler(BaseHandler):
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']: if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
data = resp.read() data = resp.read()
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]') try:
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
for header in headers: for header in headers:
resp.headers[header.get('http-equiv').lower()] = header.get('content') resp.headers[header.get('http-equiv').lower()] = header.get('content')
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data) fp = BytesIO(data)
old_resp = resp old_resp = resp