Feeds: use crawler.py encoding detection

master
pictuga 2015-03-24 23:23:40 +08:00
parent 29d9e4702f
commit 9d64c31947
1 changed file with 5 additions and 6 deletions

View File

@@ -12,6 +12,8 @@ from lxml import etree
from dateutil import tz from dateutil import tz
import dateutil.parser import dateutil.parser
from . import crawler
try: try:
from wheezy.template.engine import Engine from wheezy.template.engine import Engine
from wheezy.template.loader import DictLoader from wheezy.template.loader import DictLoader
@@ -88,12 +90,9 @@ class FeedException(Exception):
def parse(data): def parse(data):
# encoding # encoding
match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:100]) if isinstance(data, bytes):
if match: enc = crawler.detect_encoding(data)
enc = match.groups()[0].lower().decode() data = data.decode(enc, 'replace')
if isinstance(data, bytes):
data = data.decode(enc, 'ignore')
data = data.encode(enc)
# parse # parse
parser = etree.XMLParser(recover=True) parser = etree.XMLParser(recover=True)