From bfad6b7a4ac44cd7af5b647b32fa179a96120ee7 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Mon, 6 Apr 2020 16:55:39 +0200
Subject: [PATCH] readabilite: clean before counting

To remove links which are not kept anyway
---
 morss/readabilite.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/morss/readabilite.py b/morss/readabilite.py
index 87afdd4..7cbece7 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -315,6 +315,10 @@ def get_article(data, url=None, encoding=None):
         return None
 
     best = get_best_node(scores)
+
+    keep_threshold = percentile([x[1] for x in scores], 0.1)
+    clean_root(best, keep_threshold)
+
     wc = count_words(best.text_content())
     wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
 
@@ -324,7 +328,4 @@ def get_article(data, url=None, encoding=None):
     if url:
         best.make_links_absolute(url)
 
-    keep_threshold = percentile([x[1] for x in scores], 0.1)
-    clean_root(best, keep_threshold)
-
     return lxml.etree.tostring(best, pretty_print=True)