crawler: catch html parsing errors

master
pictuga 2020-07-06 12:25:38 +02:00
parent 15951d228c
commit 64af86c11e
1 changed file with 21 additions and 10 deletions

View File

@ -311,6 +311,8 @@ class AlternateHandler(BaseHandler):
# oops, not what we were looking for; let's see if the html page suggests an alternative page of the right type
data = resp.read()
try:
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links:
@ -320,6 +322,10 @@ class AlternateHandler(BaseHandler):
resp.headers['location'] = link.get('href')
break
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
@ -340,11 +346,16 @@ class HTTPEquivHandler(BaseHandler):
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
data = resp.read()
try:
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
for header in headers:
resp.headers[header.get('http-equiv').lower()] = header.get('content')
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)