readabilite: clean before counting

Run the cleanup before the word counts, so that links which are not kept anyway are removed before they are counted.
master
pictuga 2020-04-06 16:55:39 +02:00
parent 6b8c3e51e7
commit bfad6b7a4a
1 changed file with 4 additions and 3 deletions

View File

@@ -315,6 +315,10 @@ def get_article(data, url=None, encoding=None):
return None
best = get_best_node(scores)
keep_threshold = percentile([x[1] for x in scores], 0.1)
clean_root(best, keep_threshold)
wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
@@ -324,7 +328,4 @@ def get_article(data, url=None, encoding=None):
if url:
best.make_links_absolute(url)
keep_threshold = percentile([x[1] for x in scores], 0.1)
clean_root(best, keep_threshold)
return lxml.etree.tostring(best, pretty_print=True)