From 64af86c11e2b48550411c0cce58f0c237c5a72c9 Mon Sep 17 00:00:00 2001 From: pictuga Date: Mon, 6 Jul 2020 12:25:38 +0200 Subject: [PATCH] crawler: catch html parsing errors --- morss/crawler.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/morss/crawler.py b/morss/crawler.py index aced679..d98fb1c 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -311,14 +311,20 @@ class AlternateHandler(BaseHandler): # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types data = resp.read() - links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]') - for link in links: - if link.get('type', '') in self.follow: - resp.code = 302 - resp.msg = 'Moved Temporarily' - resp.headers['location'] = link.get('href') - break + try: + links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]') + + for link in links: + if link.get('type', '') in self.follow: + resp.code = 302 + resp.msg = 'Moved Temporarily' + resp.headers['location'] = link.get('href') + break + + except (ValueError, SyntaxError): + # catch parsing errors + pass fp = BytesIO(data) old_resp = resp @@ -340,10 +346,15 @@ class HTTPEquivHandler(BaseHandler): if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']: data = resp.read() - headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]') + try: + headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]') - for header in headers: - resp.headers[header.get('http-equiv').lower()] = header.get('content') + for header in headers: + resp.headers[header.get('http-equiv').lower()] = header.get('content') + + except (ValueError, SyntaxError): + # catch parsing errors + pass fp = BytesIO(data) old_resp = resp