From 65055290d4323011c1be404af83bc4b91ce0e4ba Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 18 Mar 2017 22:19:54 -1000 Subject: [PATCH] crawler: better use of chardet Scan the whole document, since the beginning of HTML pages tends to be too regular to reveal the real encoding. Ignore an ASCII detection result for the same reason. --- morss/crawler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/morss/crawler.py b/morss/crawler.py index 205a408..2d45031 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -64,9 +64,9 @@ def detect_encoding(data, con=None): if match: return match.groups()[0].lower().decode() - enc = chardet.detect(data[:1000])['encoding'] - if enc: - return enc + enc = chardet.detect(data)['encoding'] + if enc and enc != 'ascii': + return enc return 'utf-8'