crawler: fix encoding detection
parent
840842d246
commit
7b85f692a0
|
@ -7,6 +7,7 @@ from gzip import GzipFile
|
||||||
from io import BytesIO, StringIO
|
from io import BytesIO, StringIO
|
||||||
import re
|
import re
|
||||||
import chardet
|
import chardet
|
||||||
|
from cgi import parse_header
|
||||||
import lxml.html
|
import lxml.html
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import time
|
import time
|
||||||
|
@ -145,9 +146,15 @@ class GZIPHandler(BaseHandler):
|
||||||
https_request = http_request
|
https_request = http_request
|
||||||
|
|
||||||
|
|
||||||
def detect_encoding(data, con=None):
|
def detect_encoding(data, resp=None):
|
||||||
if con is not None and con.info().get('charset'):
|
if resp is not None:
|
||||||
return con.info().get('charset')
|
enc = resp.headers.get('charset')
|
||||||
|
if enc is not None:
|
||||||
|
return enc
|
||||||
|
|
||||||
|
enc = parse_header(resp.headers.get('content-type', ''))[1].get('charset')
|
||||||
|
if enc is not None:
|
||||||
|
return enc
|
||||||
|
|
||||||
match = re.search(b'charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
|
match = re.search(b'charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
|
||||||
if match:
|
if match:
|
||||||
|
|
Loading…
Reference in New Issue