From ad9bf946ecdc0a1ffae34d4ce626266be807c3b3 Mon Sep 17 00:00:00 2001 From: pictuga Date: Wed, 8 Mar 2017 11:37:12 -1000 Subject: [PATCH] crawler: use chardet again Falling back to chardet is always nice in case no encoding is specified. It somehow got dropped with commit 245ba99, most probably by accident. --- morss/crawler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/morss/crawler.py b/morss/crawler.py index e32c339..6eb5262 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -6,6 +6,7 @@ import socket from gzip import GzipFile from io import BytesIO, StringIO import re +import chardet import sqlite3 import time @@ -58,10 +59,14 @@ def detect_encoding(data, con=None): if match: return match.groups()[0].lower().decode() - match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:100]) + match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:1000]) if match: return match.groups()[0].lower().decode() + enc = chardet.detect(data[:1000])['encoding'] + if enc: + return enc + return 'utf-8'