Feeds: use crawler.py encoding detection
parent
29d9e4702f
commit
9d64c31947
|
@ -12,6 +12,8 @@ from lxml import etree
|
||||||
from dateutil import tz
|
from dateutil import tz
|
||||||
import dateutil.parser
|
import dateutil.parser
|
||||||
|
|
||||||
|
from . import crawler
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from wheezy.template.engine import Engine
|
from wheezy.template.engine import Engine
|
||||||
from wheezy.template.loader import DictLoader
|
from wheezy.template.loader import DictLoader
|
||||||
|
@ -88,12 +90,9 @@ class FeedException(Exception):
|
||||||
|
|
||||||
def parse(data):
|
def parse(data):
|
||||||
# encoding
|
# encoding
|
||||||
match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
|
|
||||||
if match:
|
|
||||||
enc = match.groups()[0].lower().decode()
|
|
||||||
if isinstance(data, bytes):
|
if isinstance(data, bytes):
|
||||||
data = data.decode(enc, 'ignore')
|
enc = crawler.detect_encoding(data)
|
||||||
data = data.encode(enc)
|
data = data.decode(enc, 'replace')
|
||||||
|
|
||||||
# parse
|
# parse
|
||||||
parser = etree.XMLParser(recover=True)
|
parser = etree.XMLParser(recover=True)
|
||||||
|
|
Loading…
Reference in New Issue