crawler: better use of chardet
Scan the whole document, since the beginning of HTML pages tends to be too regular. Ignore ASCII detection results for the same reason.
master
parent
9ee6ff60e1
commit
65055290d4
|
@ -64,9 +64,9 @@ def detect_encoding(data, con=None):
|
||||||
if match:
|
if match:
|
||||||
return match.groups()[0].lower().decode()
|
return match.groups()[0].lower().decode()
|
||||||
|
|
||||||
enc = chardet.detect(data[:1000])['encoding']
|
enc = chardet.detect(data)['encoding']
|
||||||
if enc:
|
if enc and enc != 'ascii':
|
||||||
return enc
|
return enc
|
||||||
|
|
||||||
return 'utf-8'
|
return 'utf-8'
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue