feeds & morss: many encoding/tostring fixes

master
pictuga 2020-03-19 12:53:25 +01:00
parent c09b457168
commit bda51b0fc7
2 changed files with 36 additions and 27 deletions

View File

@ -136,35 +136,34 @@ class ParserBase(object):
# delete oneself # delete oneself
pass pass
def tostring(self): def tostring(self, **k):
# output in its input format # output in its input format
# to output in something fancy (json, csv, html), change class type # to output in something fancy (json, csv, html), change class type with .convert first
pass pass
def torss(self): def torss(self, **k):
return self.convert(FeedXML).tostring() return self.convert(FeedXML).tostring(**k)
def tojson(self): def tojson(self, **k):
return self.convert(FeedJSON).tostring() return self.convert(FeedJSON).tostring(**k)
def tocsv(self): def tocsv(self, encoding='unicode'):
# TODO temporary
out = StringIO() out = StringIO()
c = csv.writer(out, dialect=csv.excel) c = csv.writer(out, dialect=csv.excel)
for item in self.items: for item in self.items:
row = [getattr(item, x) for x in item.dic] row = [getattr(item, x) for x in item.dic]
if sys.version_info[0] < 3: if encoding != 'unicode':
row = [x.encode('utf-8') if isinstance(x, unicode) else x for x in row] row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row]
c.writerow(row) c.writerow(row)
out.seek(0) out.seek(0)
return out.read() return out.read()
def tohtml(self): def tohtml(self, **k):
return self.convert(FeedHTML).tostring() return self.convert(FeedHTML).tostring(**k)
def convert(self, TargetParser): def convert(self, TargetParser):
target = TargetParser() target = TargetParser()
@ -297,8 +296,8 @@ class ParserXML(ParserBase):
def remove(self): def remove(self):
return self.root.getparent().remove(self.root) return self.root.getparent().remove(self.root)
def tostring(self, **k): def tostring(self, encoding='unicode', **k):
return etree.tostring(self.root, **k) return etree.tostring(self.root, encoding=encoding, **k)
def _rule_parse(self, rule): def _rule_parse(self, rule):
test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href
@ -443,8 +442,8 @@ class ParserHTML(ParserXML):
def parse(self, raw): def parse(self, raw):
return lxml.html.fromstring(raw) return lxml.html.fromstring(raw)
def tostring(self, **k): def tostring(self, encoding='unicode', **k):
return lxml.html.tostring(self.root, **k) return lxml.html.tostring(self.root, encoding=encoding, **k)
def rule_search_all(self, rule): def rule_search_all(self, rule):
try: try:
@ -499,9 +498,14 @@ class ParserJSON(ParserBase):
# delete oneself FIXME # delete oneself FIXME
pass pass
def tostring(self): def tostring(self, encoding='unicode', **k):
return json.dumps(self.root, indent=True, ensure_ascii=False) dump = json.dumps(self.root, ensure_ascii=False, **k) # ensure_ascii = False to have proper (unicode) string and not \u00
# ensure_ascii = False to have proper (unicode?) string and not \u00
if encoding != 'unicode':
return dump.encode(encoding)
else:
return dump
def _rule_parse(self, rule): def _rule_parse(self, rule):
return rule.split(".") return rule.split(".")
@ -683,9 +687,9 @@ class Item(Uniq):
class FeedXML(Feed, ParserXML): class FeedXML(Feed, ParserXML):
itemsClass = 'ItemXML' itemsClass = 'ItemXML'
def tostring(self, **k): def tostring(self, encoding='unicode', **k):
# override needed due to "getroottree" inclusion # override needed due to "getroottree" inclusion
return etree.tostring(self.root.getroottree(), **k) return etree.tostring(self.root.getroottree(), encoding=encoding, **k)
class ItemXML(Item, ParserXML): class ItemXML(Item, ParserXML):

View File

@ -447,18 +447,23 @@ def FeedFormat(rss, options):
raise MorssException('Invalid callback var name') raise MorssException('Invalid callback var name')
elif options.json: elif options.json:
if options.indent: if options.indent:
return rss.tojson(indent=4) return rss.tojson(encoding='UTF-8', indent=4)
else: else:
return rss.tojson() return rss.tojson(encoding='UTF-8')
elif options.csv: elif options.csv:
return rss.tocsv() return rss.tocsv(encoding='UTF-8')
elif options.reader: elif options.reader:
return rss.tohtml() return rss.tohtml(encoding='UTF-8')
else: else:
if options.indent: if options.indent:
return rss.tostring(xml_declaration=True, encoding='UTF-8', pretty_print=True) return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True)
else: else:
return rss.tostring(xml_declaration=True, encoding='UTF-8') return rss.torss(xml_declaration=True, encoding='UTF-8')
def process(url, cache=None, options=None): def process(url, cache=None, options=None):