Fix bad character detection
Now works with any encoding and is no longer restricted to UTF-8. Uses a regex to find the encoding (not perfect, but rather fast, since it is only run on a substring).
master
parent
3ba74649f6
commit
3176c2a8e8
9
feeds.py
9
feeds.py
|
@ -42,10 +42,17 @@ class FeedException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def parse(data):
|
def parse(data):
|
||||||
data = data.decode('utf-8', 'replace').encode('utf-8')
|
# encoding
|
||||||
|
match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
|
||||||
|
if match:
|
||||||
|
enc = match.groups()[0].lower()
|
||||||
|
data = data.decode(enc, 'ignore').encode(enc)
|
||||||
|
|
||||||
|
# parse
|
||||||
parser = etree.XMLParser(recover=True)
|
parser = etree.XMLParser(recover=True)
|
||||||
doc = etree.fromstring(data, parser)
|
doc = etree.fromstring(data, parser)
|
||||||
|
|
||||||
|
# rss
|
||||||
match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
|
match = doc.xpath("//atom03:feed|//atom:feed|//channel|//rdf:rdf|//rdf:RDF", namespaces=NSMAP)
|
||||||
if len(match):
|
if len(match):
|
||||||
mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
|
mtable = { 'rdf:rdf': FeedParserRSS, 'channel': FeedParserRSS,
|
||||||
|
|
Loading…
Reference in New Issue