From 7b85f692a07e28b800a77aa8d94612d99d3bb750 Mon Sep 17 00:00:00 2001 From: pictuga Date: Fri, 27 Oct 2017 23:14:08 +0200 Subject: [PATCH] crawler: fix encoding detection --- morss/crawler.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/morss/crawler.py b/morss/crawler.py index ec2fd26..bdae931 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -7,6 +7,7 @@ from gzip import GzipFile from io import BytesIO, StringIO import re import chardet +from cgi import parse_header import lxml.html import sqlite3 import time @@ -145,9 +146,15 @@ class GZIPHandler(BaseHandler): https_request = http_request -def detect_encoding(data, con=None): - if con is not None and con.info().get('charset'): - return con.info().get('charset') +def detect_encoding(data, resp=None): + if resp is not None: + enc = resp.headers.get('charset') + if enc is not None: + return enc + + enc = parse_header(resp.headers.get('content-type', ''))[1].get('charset') + if enc is not None: + return enc match = re.search(b'charset=["\']?([0-9a-zA-Z-]+)', data[:1000]) if match: