crawler: use chardet again
Always nice to have in case no encoding is specified. It somehow got dropped with commit 245ba99, most probably by accident.
master
parent
4b8e3d1b8b
commit
ad9bf946ec
|
@ -6,6 +6,7 @@ import socket
|
||||||
from gzip import GzipFile
|
from gzip import GzipFile
|
||||||
from io import BytesIO, StringIO
|
from io import BytesIO, StringIO
|
||||||
import re
|
import re
|
||||||
|
import chardet
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
@ -58,10 +59,14 @@ def detect_encoding(data, con=None):
|
||||||
if match:
|
if match:
|
||||||
return match.groups()[0].lower().decode()
|
return match.groups()[0].lower().decode()
|
||||||
|
|
||||||
match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
|
match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:1000])
|
||||||
if match:
|
if match:
|
||||||
return match.groups()[0].lower().decode()
|
return match.groups()[0].lower().decode()
|
||||||
|
|
||||||
|
enc = chardet.detect(data[:1000])['encoding']
|
||||||
|
if enc:
|
||||||
|
return enc
|
||||||
|
|
||||||
return 'utf-8'
|
return 'utf-8'
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue