From fb8825b41039228c5b88be01097508ec842b5d62 Mon Sep 17 00:00:00 2001 From: pictuga Date: Wed, 8 Mar 2017 17:50:57 -1000 Subject: [PATCH] crawler: parse html to get http-equiv For sure slower, but way cleaner (and probably more stable) --- morss/crawler.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/morss/crawler.py b/morss/crawler.py index 6eb5262..b8d8dd9 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -7,6 +7,7 @@ from gzip import GzipFile from io import BytesIO, StringIO import re import chardet +import lxml.html import sqlite3 import time @@ -156,20 +157,18 @@ class HTTPEquivHandler(BaseHandler): def http_response(self, req, resp): contenttype = resp.info().get('Content-Type', '').split(';')[0] - if 200 <= resp.code < 300 and contenttype.startswith('text/'): - if contenttype in MIMETYPE['html']: - data = resp.read() + if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']: + data = resp.read() - regex = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>' - headers = [x.groupdict() for x in re.finditer(regex, data[:1000].decode('utf-8', 'replace'))] + headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]') - for header in headers: - resp.headers[header['key'].lower()] = header['value'] + for header in headers: + resp.headers[header.get('http-equiv').lower()] = header.get('content') - fp = BytesIO(data) - old_resp = resp - resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg + fp = BytesIO(data) + old_resp = resp + resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg return resp