crawler: catch html parsing errors
This commit is contained in:
		@@ -311,14 +311,20 @@ class AlternateHandler(BaseHandler):
 | 
				
			|||||||
            # oops, not what we were looking for, let's see if the html page suggests an alternative page of the right types
 | 
					            # oops, not what we were looking for, let's see if the html page suggests an alternative page of the right types
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            data = resp.read()
 | 
					            data = resp.read()
 | 
				
			||||||
            links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            for link in links:
 | 
					            try:
 | 
				
			||||||
                if link.get('type', '') in self.follow:
 | 
					                links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
 | 
				
			||||||
                    resp.code = 302
 | 
					
 | 
				
			||||||
                    resp.msg = 'Moved Temporarily'
 | 
					                for link in links:
 | 
				
			||||||
                    resp.headers['location'] = link.get('href')
 | 
					                    if link.get('type', '') in self.follow:
 | 
				
			||||||
                    break
 | 
					                        resp.code = 302
 | 
				
			||||||
 | 
					                        resp.msg = 'Moved Temporarily'
 | 
				
			||||||
 | 
					                        resp.headers['location'] = link.get('href')
 | 
				
			||||||
 | 
					                        break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            except (ValueError, SyntaxError):
 | 
				
			||||||
 | 
					                # catch parsing errors
 | 
				
			||||||
 | 
					                pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            fp = BytesIO(data)
 | 
					            fp = BytesIO(data)
 | 
				
			||||||
            old_resp = resp
 | 
					            old_resp = resp
 | 
				
			||||||
@@ -340,10 +346,15 @@ class HTTPEquivHandler(BaseHandler):
 | 
				
			|||||||
        if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
 | 
					        if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
 | 
				
			||||||
            data = resp.read()
 | 
					            data = resp.read()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
 | 
					            try:
 | 
				
			||||||
 | 
					                headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            for header in headers:
 | 
					                for header in headers:
 | 
				
			||||||
                resp.headers[header.get('http-equiv').lower()] = header.get('content')
 | 
					                    resp.headers[header.get('http-equiv').lower()] = header.get('content')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            except (ValueError, SyntaxError):
 | 
				
			||||||
 | 
					                # catch parsing errors
 | 
				
			||||||
 | 
					                pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            fp = BytesIO(data)
 | 
					            fp = BytesIO(data)
 | 
				
			||||||
            old_resp = resp
 | 
					            old_resp = resp
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user