crawler: use chardet again

Detecting the encoding with chardet is a useful fallback when no encoding is specified. It was dropped in commit 245ba99, most probably by accident.
master
pictuga 2017-03-08 11:37:12 -10:00 committed by GitHub
parent 4b8e3d1b8b
commit ad9bf946ec
1 changed file with 6 additions and 1 deletion

View File

@@ -6,6 +6,7 @@ import socket
from gzip import GzipFile from gzip import GzipFile
from io import BytesIO, StringIO from io import BytesIO, StringIO
import re import re
import chardet
import sqlite3 import sqlite3
import time import time
@@ -58,10 +59,14 @@ def detect_encoding(data, con=None):
if match: if match:
return match.groups()[0].lower().decode() return match.groups()[0].lower().decode()
match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:100]) match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:1000])
if match: if match:
return match.groups()[0].lower().decode() return match.groups()[0].lower().decode()
enc = chardet.detect(data[:1000])['encoding']
if enc:
return enc
return 'utf-8' return 'utf-8'