crawler: better use of chardet

Scan the whole document, since the beginning of HTML pages tends to be too regular for reliable detection. Ignore an 'ascii' detection result for the same reason.
2017-03-18 22:19:54 -10:00
parent 9ee6ff60e1
commit 65055290d4
1 changed files with 3 additions and 3 deletions
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -64,9 +64,9 @@ def detect_encoding(data, con=None):
    if match:
        return match.groups()[0].lower().decode()

-    enc = chardet.detect(data[:1000])['encoding']
-    if enc:
-            return enc
+    enc = chardet.detect(data)['encoding']
+    if enc and enc != 'ascii':
+        return enc

    return 'utf-8'