From 22005065e8adfbf7297bb1983eee7199644bfd3f Mon Sep 17 00:00:00 2001 From: pictuga Date: Wed, 13 May 2020 11:44:34 +0200 Subject: [PATCH] Use etree.tostring 'method' arg Gives appropriately formatted html code. Some pages might otherwise be rendered as blank. --- morss/feeds.py | 6 +++--- morss/morss.py | 4 ++-- morss/readabilite.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/morss/feeds.py b/morss/feeds.py index 1be9b27..13cb760 100644 --- a/morss/feeds.py +++ b/morss/feeds.py @@ -319,7 +319,7 @@ class ParserXML(ParserBase): return self.root.getparent().remove(self.root) def tostring(self, encoding='unicode', **k): - return etree.tostring(self.root, encoding=encoding, **k) + return etree.tostring(self.root, encoding=encoding, method='xml', **k) def _rule_parse(self, rule): test = re.search(r'^(.*)/@([a-z]+)$', rule) # to match //div/a/@href @@ -463,7 +463,7 @@ class ParserHTML(ParserXML): return html_parse(raw, encoding=self.encoding) def tostring(self, encoding='unicode', **k): - return lxml.html.tostring(self.root, encoding=encoding, **k) + return lxml.html.tostring(self.root, encoding=encoding, method='html', **k) def rule_search_all(self, rule): try: @@ -724,7 +724,7 @@ class FeedXML(Feed, ParserXML): if self.root.getprevious() is None: self.root.addprevious(etree.PI('xml-stylesheet', 'type="text/xsl" href="/sheet.xsl"')) - return etree.tostring(self.root.getroottree(), encoding=encoding, **k) + return etree.tostring(self.root.getroottree(), encoding=encoding, method='xml', **k) class ItemXML(Item, ParserXML): diff --git a/morss/morss.py b/morss/morss.py index 2f00f3e..a3f072a 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -294,7 +294,7 @@ def ItemAfter(item, options): for link in content.xpath('//a'): log(link.text_content()) link.drop_tag() - item.content = lxml.etree.tostring(content) + item.content = lxml.etree.tostring(content, method='html') if options.noref: item.link = '' @@ -612,7 +612,7 @@ def cgi_get(environ, start_response): for elem in html.xpath('//'+tag): elem.getparent().remove(elem) - output = lxml.etree.tostring(html.getroottree(), encoding='utf-8') + output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html') elif options.get == 'article': output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug) diff --git a/morss/readabilite.py b/morss/readabilite.py index 26bdc4c..0a945e0 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -341,7 +341,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug= if url: best.make_links_absolute(url) - return lxml.etree.tostring(best if not debug else html, pretty_print=True, encoding=encoding_out) + return lxml.etree.tostring(best if not debug else html, method='html', encoding=encoding_out) if __name__ == '__main__':