From 44a3e0edc4d20ca6202f22397def4bfc25f4d640 Mon Sep 17 00:00:00 2001 From: pictuga Date: Tue, 28 Apr 2020 14:44:35 +0200 Subject: [PATCH] readabilite: specify in- and out-going encoding --- morss/morss.py | 4 ++-- morss/readabilite.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/morss/morss.py b/morss/morss.py index 4f4b9af..e894546 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -260,7 +260,7 @@ def ItemFill(item, options, feedurl='/', fast=False): log('non-text page') return True - out = readabilite.get_article(data, url=con.geturl(), encoding=encoding) + out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode') if out is not None: item.content = out @@ -635,7 +635,7 @@ def cgi_get(environ, start_response): output = lxml.etree.tostring(html.getroottree(), encoding='utf-8') elif options.get == 'article': - output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug) + output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug) else: raise MorssException('no :get option passed') diff --git a/morss/readabilite.py b/morss/readabilite.py index 41a6991..e14f88b 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -316,10 +316,10 @@ def get_best_node(ranked_grades): return lowest -def get_article(data, url=None, encoding=None, debug=False, threshold=5): +def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5): " Input a raw html string, returns a raw html string of the article " - html = parse(data, encoding) + html = parse(data, encoding_in) score_all(html) scores = rank_grades(get_all_scores(html)) @@ -341,7 +341,7 @@ def get_article(data, url=None, encoding=None, debug=False, threshold=5): if url: best.make_links_absolute(url) - return lxml.etree.tostring(best if not debug else html, pretty_print=True) + return lxml.etree.tostring(best if not debug else html, pretty_print=True, encoding=encoding_out) if __name__ == '__main__': @@ -349,7 +349,7 @@ if __name__ == '__main__': from . import crawler data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it') - article = get_article(data, url=con.geturl(), encoding=encoding) + article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode') if not sys.flags.interactive: - print(article.decode(encoding)) + print(article)