crawler: use chardet again

chardet is a useful fallback when no encoding is specified. It was somehow dropped in commit 245ba99, most probably by accident.
master
pictuga 2017-03-08 11:37:12 -10:00 committed by GitHub
parent 4b8e3d1b8b
commit ad9bf946ec
1 changed files with 6 additions and 1 deletions

View File

@@ -6,6 +6,7 @@ import socket
from gzip import GzipFile
from io import BytesIO, StringIO
import re
import chardet
import sqlite3
import time
@@ -58,10 +59,14 @@ def detect_encoding(data, con=None):
if match:
return match.groups()[0].lower().decode()
match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:1000])
if match:
return match.groups()[0].lower().decode()
enc = chardet.detect(data[:1000])['encoding']
if enc:
return enc
return 'utf-8'