diff --git a/morss/crawler.py b/morss/crawler.py
index 6eb5262..b8d8dd9 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -7,6 +7,7 @@ from gzip import GzipFile
from io import BytesIO, StringIO
import re
import chardet
+import lxml.html
import sqlite3
import time
@@ -156,20 +157,18 @@ class HTTPEquivHandler(BaseHandler):
def http_response(self, req, resp):
contenttype = resp.info().get('Content-Type', '').split(';')[0]
- if 200 <= resp.code < 300 and contenttype.startswith('text/'):
- if contenttype in MIMETYPE['html']:
- data = resp.read()
+ if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
+ data = resp.read()
- regex = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>'
- headers = [x.groupdict() for x in re.finditer(regex, data[:1000].decode('utf-8', 'replace'))]
+ headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
- for header in headers:
- resp.headers[header['key'].lower()] = header['value']
+ for header in headers:
+ resp.headers[header.get('http-equiv').lower()] = header.get('content')
- fp = BytesIO(data)
- old_resp = resp
- resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
- resp.msg = old_resp.msg
+ fp = BytesIO(data)
+ old_resp = resp
+ resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+ resp.msg = old_resp.msg
return resp