From 7b85f692a07e28b800a77aa8d94612d99d3bb750 Mon Sep 17 00:00:00 2001
From: pictuga <contact@pictuga.com>
Date: Fri, 27 Oct 2017 23:14:08 +0200
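Subject: [PATCH] crawler: fix encoding detection

detect_encoding() only looked for a response header literally named
"charset", so a charset declared as a parameter of the Content-Type
header (e.g. "text/html; charset=utf-8") was not picked up from the
headers. Parse Content-Type with cgi.parse_header() and use its
"charset" parameter, before falling back to searching the first bytes
of the body. The optional argument is also renamed from `con` to `resp`.
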
---
 morss/crawler.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index ec2fd26..bdae931 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -7,6 +7,7 @@ from gzip import GzipFile
 from io import BytesIO, StringIO
 import re
 import chardet
+from cgi import parse_header
 import lxml.html
 import sqlite3
 import time
@@ -145,9 +146,15 @@ class GZIPHandler(BaseHandler):
     https_request = http_request
 
 
-def detect_encoding(data, con=None):
-    if con is not None and con.info().get('charset'):
-        return con.info().get('charset')
+def detect_encoding(data, resp=None):
+    if resp is not None:
+        enc = resp.headers.get('charset')
+        if enc is not None:
+            return enc
+
+        enc = parse_header(resp.headers.get('content-type', ''))[1].get('charset')
+        if enc is not None:
+            return enc
 
     match = re.search(b'charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
     if match: