From bda51b0fc7421c81c95cb5135217a6dd9b6670ff Mon Sep 17 00:00:00 2001 From: pictuga Date: Thu, 19 Mar 2020 12:53:25 +0100 Subject: [PATCH] feeds & morss: many encoding/tostring fixes --- morss/feeds.py | 46 +++++++++++++++++++++++++--------------------- morss/morss.py | 17 +++++++++++------ 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/morss/feeds.py b/morss/feeds.py index 4f51704..a02bbb6 100644 --- a/morss/feeds.py +++ b/morss/feeds.py @@ -136,35 +136,34 @@ class ParserBase(object): # delete oneslf pass - def tostring(self): + def tostring(self, **k): # output in its input format - # to output in sth fancy (json, csv, html), change class type + # to output in sth fancy (json, csv, html), change class type with .convert first pass - def torss(self): - return self.convert(FeedXML).tostring() + def torss(self, **k): + return self.convert(FeedXML).tostring(**k) - def tojson(self): - return self.convert(FeedJSON).tostring() + def tojson(self, **k): + return self.convert(FeedJSON).tostring(**k) - def tocsv(self): - # TODO temporary + def tocsv(self, encoding='unicode'): out = StringIO() c = csv.writer(out, dialect=csv.excel) for item in self.items: row = [getattr(item, x) for x in item.dic] - if sys.version_info[0] < 3: - row = [x.encode('utf-8') if isinstance(x, unicode) else x for x in row] + if encoding != 'unicode': + row = [x.encode(encoding) if isinstance(x, unicode) else x for x in row] c.writerow(row) out.seek(0) return out.read() - def tohtml(self): - return self.convert(FeedHTML).tostring() + def tohtml(self, **k): + return self.convert(FeedHTML).tostring(**k) def convert(self, TargetParser): target = TargetParser() @@ -297,8 +296,8 @@ class ParserXML(ParserBase): def remove(self): return self.root.getparent().remove(self.root) - def tostring(self, **k): - return etree.tostring(self.root, **k) + def tostring(self, encoding='unicode', **k): + return etree.tostring(self.root, encoding=encoding, **k) def _rule_parse(self, rule): test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href @@ -443,8 +442,8 @@ class ParserHTML(ParserXML): def parse(self, raw): return lxml.html.fromstring(raw) - def tostring(self, **k): - return lxml.html.tostring(self.root, **k) + def tostring(self, encoding='unicode', **k): + return lxml.html.tostring(self.root, encoding=encoding, **k) def rule_search_all(self, rule): try: @@ -499,9 +498,14 @@ class ParserJSON(ParserBase): # delete oneself FIXME pass - def tostring(self): - return json.dumps(self.root, indent=True, ensure_ascii=False) - # ensure_ascii = False to have proper (unicode?) string and not \u00 + def tostring(self, encoding='unicode', **k): + dump = json.dumps(self.root, ensure_ascii=False, **k) # ensure_ascii = False to have proper (unicode) string and not \u00 + + if encoding != 'unicode': + return dump.encode(encoding) + + else: + return dump def _rule_parse(self, rule): return rule.split(".") @@ -683,9 +687,9 @@ class Item(Uniq): class FeedXML(Feed, ParserXML): itemsClass = 'ItemXML' - def tostring(self, **k): + def tostring(self, encoding='unicode', **k): # override needed due to "getroottree" inclusion - return etree.tostring(self.root.getroottree(), **k) + return etree.tostring(self.root.getroottree(), encoding=encoding, **k) class ItemXML(Item, ParserXML): diff --git a/morss/morss.py b/morss/morss.py index 8ac301c..e45d12a 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -447,18 +447,23 @@ def FeedFormat(rss, options): raise MorssException('Invalid callback var name') elif options.json: if options.indent: - return rss.tojson(indent=4) + return rss.tojson(encoding='UTF-8', indent=4) + else: - return rss.tojson() + return rss.tojson(encoding='UTF-8') + elif options.csv: - return rss.tocsv() + return rss.tocsv(encoding='UTF-8') + elif options.reader: - return rss.tohtml() + return rss.tohtml(encoding='UTF-8') + else: if options.indent: - return rss.tostring(xml_declaration=True, encoding='UTF-8', pretty_print=True) + return rss.torss(xml_declaration=True, encoding='UTF-8', pretty_print=True) + else: - return rss.tostring(xml_declaration=True, encoding='UTF-8') + return rss.torss(xml_declaration=True, encoding='UTF-8') def process(url, cache=None, options=None):