crawler: parse html to get http-equiv
For sure slower, but way cleaner (and probably more stable)
master
parent
92b4a5c57c
commit
fb8825b410
|
@ -7,6 +7,7 @@ from gzip import GzipFile
|
||||||
from io import BytesIO, StringIO
|
from io import BytesIO, StringIO
|
||||||
import re
|
import re
|
||||||
import chardet
|
import chardet
|
||||||
|
import lxml.html
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
@ -156,20 +157,18 @@ class HTTPEquivHandler(BaseHandler):
|
||||||
|
|
||||||
def http_response(self, req, resp):
|
def http_response(self, req, resp):
|
||||||
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
contenttype = resp.info().get('Content-Type', '').split(';')[0]
|
||||||
if 200 <= resp.code < 300 and contenttype.startswith('text/'):
|
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
|
||||||
if contenttype in MIMETYPE['html']:
|
data = resp.read()
|
||||||
data = resp.read()
|
|
||||||
|
|
||||||
regex = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>'
|
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
|
||||||
headers = [x.groupdict() for x in re.finditer(regex, data[:1000].decode('utf-8', 'replace'))]
|
|
||||||
|
|
||||||
for header in headers:
|
for header in headers:
|
||||||
resp.headers[header['key'].lower()] = header['value']
|
resp.headers[header.get('http-equiv').lower()] = header.get('content')
|
||||||
|
|
||||||
fp = BytesIO(data)
|
fp = BytesIO(data)
|
||||||
old_resp = resp
|
old_resp = resp
|
||||||
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
|
||||||
resp.msg = old_resp.msg
|
resp.msg = old_resp.msg
|
||||||
|
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue