readabilite: clean_html before scoring

Surprisingly efficient
master
pictuga 2017-03-24 21:50:46 -10:00
parent bfefa8d599
commit 4aa25bf3d8
1 changed file with 2 additions and 2 deletions

View File

@@ -208,13 +208,13 @@ def get_best_node(grades):
def get_article(data, url=None, encoding=None): def get_article(data, url=None, encoding=None):
html = parse(data, encoding) html = parse(data, encoding)
clean_html(html)
br2p(html) br2p(html)
scores = score_all(html) scores = score_all(html)
best = get_best_node(scores) best = get_best_node(scores)
clean_html(best)
if url: if url:
best.make_links_absolute(url) best.make_links_absolute(url)