crawler: parse html to get http-equiv

For sure slower, but way cleaner (and probably more stable)
master
pictuga 2017-03-08 17:50:57 -10:00
parent 92b4a5c57c
commit fb8825b410
1 changed file with 10 additions and 11 deletions


@@ -7,6 +7,7 @@ from gzip import GzipFile
 from io import BytesIO, StringIO
 import re
 import chardet
+import lxml.html
 import sqlite3
 import time
@@ -156,20 +157,18 @@ class HTTPEquivHandler(BaseHandler):
     def http_response(self, req, resp):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
-        if 200 <= resp.code < 300 and contenttype.startswith('text/'):
-            if contenttype in MIMETYPE['html']:
-                data = resp.read()
+        if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
+            data = resp.read()

-                regex = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>'
-                headers = [x.groupdict() for x in re.finditer(regex, data[:1000].decode('utf-8', 'replace'))]
+            headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')

-                for header in headers:
-                    resp.headers[header['key'].lower()] = header['value']
+            for header in headers:
+                resp.headers[header.get('http-equiv').lower()] = header.get('content')

-                fp = BytesIO(data)
-                old_resp = resp
-                resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-                resp.msg = old_resp.msg
+            fp = BytesIO(data)
+            old_resp = resp
+            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg

         return resp
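
For reference, a minimal standalone sketch (not part of the commit) of what the new lxml-based extraction does, run against a small illustrative HTML sample; the sample markup and variable names here are assumptions for demonstration only:

import lxml.html

# Illustrative HTML containing http-equiv meta tags, similar to what the
# handler would see in a response body.
data = b"""<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<meta http-equiv="Refresh" content="30">
</head><body>hello</body></html>"""

# Parse only the first 10000 bytes, mirroring the handler above, and pick
# out every <meta> element that carries an http-equiv attribute.
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')

for header in headers:
    # e.g. 'content-type' -> 'text/html; charset=utf-8'
    print(header.get('http-equiv').lower(), '->', header.get('content'))

Compared with the old regex, the parser tolerates attribute reordering, single or double quotes, and other markup variations, at the cost of building a DOM for the (truncated) document.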