From fb8825b41039228c5b88be01097508ec842b5d62 Mon Sep 17 00:00:00 2001
From: pictuga <contact@pictuga.com>
Date: Wed, 8 Mar 2017 17:50:57 -1000
Subject: [PATCH] crawler: parse html to get http-equiv

Certainly slower than the previous regex, but much cleaner (and probably more robust).
---
 morss/crawler.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)
diff --git a/morss/crawler.py b/morss/crawler.py
index 6eb5262..b8d8dd9 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -7,6 +7,7 @@ from gzip import GzipFile
 from io import BytesIO, StringIO
 import re
 import chardet
+import lxml.html
 import sqlite3
 import time
 
@@ -156,20 +157,18 @@ class HTTPEquivHandler(BaseHandler):
 
     def http_response(self, req, resp):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
-        if 200 <= resp.code < 300 and contenttype.startswith('text/'):
-            if contenttype in MIMETYPE['html']:
-                data = resp.read()
+        if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
+            data = resp.read()
 
-                regex = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>'
-                headers = [x.groupdict() for x in re.finditer(regex, data[:1000].decode('utf-8', 'replace'))]
+            headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
 
-                for header in headers:
-                    resp.headers[header['key'].lower()] = header['value']
+            for header in headers:
+                resp.headers[header.get('http-equiv').lower()] = header.get('content')
 
-                fp = BytesIO(data)
-                old_resp = resp
-                resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-                resp.msg = old_resp.msg
+            fp = BytesIO(data)
+            old_resp = resp
+            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
 
         return resp