crawler: better use of chardet

Scan the whole document, since the beginning of HTML pages tends to be too regular. Ignore an ASCII detection result for the same reason.
master
pictuga 2017-03-18 22:19:54 -10:00
parent 9ee6ff60e1
commit 65055290d4
1 changed files with 3 additions and 3 deletions

View File

@ -64,9 +64,9 @@ def detect_encoding(data, con=None):
if match:
return match.groups()[0].lower().decode()
enc = chardet.detect(data[:1000])['encoding']
if enc:
return enc
enc = chardet.detect(data)['encoding']
if enc and enc != 'ascii':
return enc
return 'utf-8'