From 9d64c31947236e385185ec58bf2c01d4e45318e1 Mon Sep 17 00:00:00 2001 From: pictuga Date: Tue, 24 Mar 2015 23:23:40 +0800 Subject: [PATCH] Feeds: use crawler.py encoding detection --- morss/feeds.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/morss/feeds.py b/morss/feeds.py index 49ed038..be919b6 100644 --- a/morss/feeds.py +++ b/morss/feeds.py @@ -12,6 +12,8 @@ from lxml import etree from dateutil import tz import dateutil.parser +from . import crawler + try: from wheezy.template.engine import Engine from wheezy.template.loader import DictLoader @@ -88,12 +90,9 @@ class FeedException(Exception): def parse(data): # encoding - match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:100]) - if match: - enc = match.groups()[0].lower().decode() - if isinstance(data, bytes): - data = data.decode(enc, 'ignore') - data = data.encode(enc) + if isinstance(data, bytes): + enc = crawler.detect_encoding(data) + data = data.decode(enc, 'replace') # parse parser = etree.XMLParser(recover=True)