feeds & morss: many encoding/tostring fixes
parent
c09b457168
commit
bda51b0fc7
|
@ -136,35 +136,34 @@ class ParserBase(object):
|
||||||
# delete oneself
|
# delete oneself
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def tostring(self):
|
def tostring(self, **k):
|
||||||
# output in its input format
|
# output in its input format
|
||||||
# to output in sth fancy (json, csv, html), change class type
|
# to output in sth fancy (json, csv, html), change class type with .convert first
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def torss(self):
|
def torss(self, **k):
|
||||||
return self.convert(FeedXML).tostring()
|
return self.convert(FeedXML).tostring(**k)
|
||||||
|
|
||||||
def tojson(self):
|
def tojson(self, **k):
|
||||||
return self.convert(FeedJSON).tostring()
|
return self.convert(FeedJSON).tostring(**k)
|
||||||
|
|
||||||
def tocsv(self):
|
def tocsv(self, encoding='unicode'):
|
||||||
# TODO temporary
|
|
||||||
out = StringIO()
|
out = StringIO()
|
||||||
c = csv.writer(out, dialect=csv.excel)
|
c = csv.writer(out, dialect=csv.excel)
|
||||||
|
|
||||||
for item in self.items:
|
for item in self.items:
|
||||||
row = [getattr(item, x) for x in item.dic]
|
row = [getattr(item, x) for x in item.dic]
|
||||||
|
|
||||||
if sys.version_info[0] < 3:
|
if encoding != 'unicode':
|
||||||
row = [x.encode('utf-8') if isinstance(x, unicode) else x for x in row]
|
row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row]
|
||||||
|
|
||||||
c.writerow(row)
|
c.writerow(row)
|
||||||
|
|
||||||
out.seek(0)
|
out.seek(0)
|
||||||
return out.read()
|
return out.read()
|
||||||
|
|
||||||
def tohtml(self):
|
def tohtml(self, **k):
|
||||||
return self.convert(FeedHTML).tostring()
|
return self.convert(FeedHTML).tostring(**k)
|
||||||
|
|
||||||
def convert(self, TargetParser):
|
def convert(self, TargetParser):
|
||||||
target = TargetParser()
|
target = TargetParser()
|
||||||
|
@ -297,8 +296,8 @@ class ParserXML(ParserBase):
|
||||||
def remove(self):
|
def remove(self):
|
||||||
return self.root.getparent().remove(self.root)
|
return self.root.getparent().remove(self.root)
|
||||||
|
|
||||||
def tostring(self, **k):
|
def tostring(self, encoding='unicode', **k):
|
||||||
return etree.tostring(self.root, **k)
|
return etree.tostring(self.root, encoding=encoding, **k)
|
||||||
|
|
||||||
def _rule_parse(self, rule):
|
def _rule_parse(self, rule):
|
||||||
test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
|
test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
|
||||||
|
@ -443,8 +442,8 @@ class ParserHTML(ParserXML):
|
||||||
def parse(self, raw):
|
def parse(self, raw):
|
||||||
return lxml.html.fromstring(raw)
|
return lxml.html.fromstring(raw)
|
||||||
|
|
||||||
def tostring(self, **k):
|
def tostring(self, encoding='unicode', **k):
|
||||||
return lxml.html.tostring(self.root, **k)
|
return lxml.html.tostring(self.root, encoding=encoding, **k)
|
||||||
|
|
||||||
def rule_search_all(self, rule):
|
def rule_search_all(self, rule):
|
||||||
try:
|
try:
|
||||||
|
@ -499,9 +498,14 @@ class ParserJSON(ParserBase):
|
||||||
# delete oneself FIXME
|
# delete oneself FIXME
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def tostring(self):
|
def tostring(self, encoding='unicode', **k):
|
||||||
return json.dumps(self.root, indent=True, ensure_ascii=False)
|
dump = json.dumps(self.root, ensure_ascii=False, **k) # ensure_ascii = False to have proper (unicode) string and not \u00
|
||||||
# ensure_ascii = False to have proper (unicode?) string and not \u00
|
|
||||||
|
if encoding != 'unicode':
|
||||||
|
return dump.encode(encoding)
|
||||||
|
|
||||||
|
else:
|
||||||
|
return dump
|
||||||
|
|
||||||
def _rule_parse(self, rule):
|
def _rule_parse(self, rule):
|
||||||
return rule.split(".")
|
return rule.split(".")
|
||||||
|
@ -683,9 +687,9 @@ class Item(Uniq):
|
||||||
class FeedXML(Feed, ParserXML):
|
class FeedXML(Feed, ParserXML):
|
||||||
itemsClass = 'ItemXML'
|
itemsClass = 'ItemXML'
|
||||||
|
|
||||||
def tostring(self, **k):
|
def tostring(self, encoding='unicode', **k):
|
||||||
# override needed due to "getroottree" inclusion
|
# override needed due to "getroottree" inclusion
|
||||||
return etree.tostring(self.root.getroottree(), **k)
|
return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
|
||||||
|
|
||||||
|
|
||||||
class ItemXML(Item, ParserXML):
|
class ItemXML(Item, ParserXML):
|
||||||
|
|
|
@ -447,18 +447,23 @@ def FeedFormat(rss, options):
|
||||||
raise MorssException('Invalid callback var name')
|
raise MorssException('Invalid callback var name')
|
||||||
elif options.json:
|
elif options.json:
|
||||||
if options.indent:
|
if options.indent:
|
||||||
return rss.tojson(indent=4)
|
return rss.tojson(encoding='UTF-8', indent=4)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return rss.tojson()
|
return rss.tojson(encoding='UTF-8')
|
||||||
|
|
||||||
elif options.csv:
|
elif options.csv:
|
||||||
return rss.tocsv()
|
return rss.tocsv(encoding='UTF-8')
|
||||||
|
|
||||||
elif options.reader:
|
elif options.reader:
|
||||||
return rss.tohtml()
|
return rss.tohtml(encoding='UTF-8')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if options.indent:
|
if options.indent:
|
||||||
return rss.tostring(xml_declaration=True, encoding='UTF-8', pretty_print=True)
|
return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return rss.tostring(xml_declaration=True, encoding='UTF-8')
|
return rss.torss(xml_declaration=True, encoding='UTF-8')
|
||||||
|
|
||||||
|
|
||||||
def process(url, cache=None, options=None):
|
def process(url, cache=None, options=None):
|
||||||
|
|
Loading…
Reference in New Issue