From 64af86c11e2b48550411c0cce58f0c237c5a72c9 Mon Sep 17 00:00:00 2001
From: pictuga <contact@pictuga.com>
Date: Mon, 6 Jul 2020 12:25:38 +0200
Subject: [PATCH] crawler: catch html parsing errors

---
 morss/crawler.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index aced679..d98fb1c 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -311,14 +311,20 @@ class AlternateHandler(BaseHandler):
             # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
 
             data = resp.read()
-            links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
 
-            for link in links:
-                if link.get('type', '') in self.follow:
-                    resp.code = 302
-                    resp.msg = 'Moved Temporarily'
-                    resp.headers['location'] = link.get('href')
-                    break
+            try:
+                links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
+
+                for link in links:
+                    if link.get('type', '') in self.follow:
+                        resp.code = 302
+                        resp.msg = 'Moved Temporarily'
+                        resp.headers['location'] = link.get('href')
+                        break
+
+            except (ValueError, SyntaxError):
+                # best-effort: if lxml cannot parse the HTML, skip the alternate-link lookup
+                pass
 
             fp = BytesIO(data)
             old_resp = resp
@@ -340,10 +346,15 @@ class HTTPEquivHandler(BaseHandler):
         if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
             data = resp.read()
 
-            headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
+            try:
+                headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
 
-            for header in headers:
-                resp.headers[header.get('http-equiv').lower()] = header.get('content')
+                for header in headers:
+                    resp.headers[header.get('http-equiv').lower()] = header.get('content')
+
+            except (ValueError, SyntaxError):
+                # best-effort: if lxml cannot parse the HTML, skip the http-equiv headers
+                pass
 
             fp = BytesIO(data)
             old_resp = resp