From a32f5a85361e7befb0ab30e1667b2ceeb4e8fbc8 Mon Sep 17 00:00:00 2001 From: pictuga Date: Thu, 9 Apr 2020 19:08:13 +0200 Subject: [PATCH] readabilite: add debug option (also used by :get) --- morss/morss.py | 2 +- morss/readabilite.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/morss/morss.py b/morss/morss.py index c80c84a..4caa060 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -661,7 +661,7 @@ def cgi_get(environ, start_response): output = lxml.etree.tostring(html.getroottree(), encoding='utf-8') elif options.get == 'article': - output = readabilite.get_article(data, url=con.geturl(), encoding=encoding) + output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug) else: raise MorssException('no :get option passed') diff --git a/morss/readabilite.py b/morss/readabilite.py index b5dad9c..ff0e1d1 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -307,7 +307,7 @@ def get_best_node(ranked_grades): return lowest -def get_article(data, url=None, encoding=None): +def get_article(data, url=None, encoding=None, debug=False): " Input a raw html string, returns a raw html string of the article " html = parse(data, encoding) @@ -319,16 +319,17 @@ def get_article(data, url=None, encoding=None): best = get_best_node(scores) - keep_threshold = percentile([x[1] for x in scores], 0.1) - clean_root(best, keep_threshold) + if not debug: + keep_threshold = percentile([x[1] for x in scores], 0.1) + clean_root(best, keep_threshold) wc = count_words(best.text_content()) wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')])) - if wc - wca < 50 or float(wca) / wc > 0.3: + if not debug and (wc - wca < 50 or float(wca) / wc > 0.3): return None if url: best.make_links_absolute(url) - return lxml.etree.tostring(best, pretty_print=True) + return lxml.etree.tostring(best if not debug else html, pretty_print=True)