diff --git a/morss/morss.py b/morss/morss.py index 4f4b9af..e894546 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -260,7 +260,7 @@ def ItemFill(item, options, feedurl='/', fast=False): log('non-text page') return True - out = readabilite.get_article(data, url=con.geturl(), encoding=encoding) + out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode') if out is not None: item.content = out @@ -635,7 +635,7 @@ def cgi_get(environ, start_response): output = lxml.etree.tostring(html.getroottree(), encoding='utf-8') elif options.get == 'article': - output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug) + output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug) else: raise MorssException('no :get option passed') diff --git a/morss/readabilite.py b/morss/readabilite.py index 41a6991..e14f88b 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -316,10 +316,10 @@ def get_best_node(ranked_grades): return lowest -def get_article(data, url=None, encoding=None, debug=False, threshold=5): +def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5): " Input a raw html string, returns a raw html string of the article " - html = parse(data, encoding) + html = parse(data, encoding_in) score_all(html) scores = rank_grades(get_all_scores(html)) @@ -341,7 +341,7 @@ def get_article(data, url=None, encoding=None, debug=False, threshold=5): if url: best.make_links_absolute(url) - return lxml.etree.tostring(best if not debug else html, pretty_print=True) + return lxml.etree.tostring(best if not debug else html, pretty_print=True, encoding=encoding_out) if __name__ == '__main__': @@ -349,7 +349,7 @@ if __name__ == '__main__': from . import crawler data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it') - article = get_article(data, url=con.geturl(), encoding=encoding) + article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode') if not sys.flags.interactive: - print(article.decode(encoding)) + print(article)