crawler: fix encoding detection

master
pictuga 2017-10-27 23:14:08 +02:00
parent 840842d246
commit 7b85f692a0
1 changed file with 10 additions and 3 deletions

View File

@@ -7,6 +7,7 @@ from gzip import GzipFile
from io import BytesIO, StringIO from io import BytesIO, StringIO
import re import re
import chardet import chardet
from cgi import parse_header
import lxml.html import lxml.html
import sqlite3 import sqlite3
import time import time
@@ -145,9 +146,15 @@ class GZIPHandler(BaseHandler):
https_request = http_request https_request = http_request
def detect_encoding(data, con=None): def detect_encoding(data, resp=None):
if con is not None and con.info().get('charset'): if resp is not None:
return con.info().get('charset') enc = resp.headers.get('charset')
if enc is not None:
return enc
enc = parse_header(resp.headers.get('content-type', ''))[1].get('charset')
if enc is not None:
return enc
match = re.search(b'charset=["\']?([0-9a-zA-Z-]+)', data[:1000]) match = re.search(b'charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
if match: if match: