readabilite: don't clean-out the top 10% nodes

Loosen up the code once again to limit over-kill
branch master
pictuga 2020-04-06 14:26:28 +02:00
parent 2f48e18bb1
commit dc9e425247
1 changed file with 17 additions and 5 deletions

@@ -44,6 +44,12 @@ def count_content(node):
     return count_words(node.text_content()) + len(node.findall('.//img'))
 
 
+def percentile(N, P):
+    # https://stackoverflow.com/a/7464107
+    n = max(int(round(P * len(N) + 0.5)), 2)
+    return N[n-2]
+
+
 class_bad = ['comment', 'community', 'extra', 'foot',
     'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
     'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',
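
The new percentile() helper expects an already-sorted list; get_article (last hunk below) presumably feeds it the node scores sorted best-first, so P=0.1 picks a value near the top of the list. A minimal sketch of that behaviour, outside the commit and with made-up scores:

# Quick check of the percentile() helper on a descending-sorted score list.
# The scores are invented; only the helper itself comes from the diff above.
def percentile(N, P):
    # https://stackoverflow.com/a/7464107
    n = max(int(round(P * len(N) + 0.5)), 2)
    return N[n-2]

scores = list(range(100, 0, -1))             # 100 fake scores, best first
threshold = percentile(scores, 0.1)

print(threshold)                             # 92
print(sum(s >= threshold for s in scores))   # 9 -> roughly the top 10% make the cut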
@@ -166,19 +172,24 @@ def spread_score(node, score):
             break
 
 
-def clean_root(root):
+def clean_root(root, keep_threshold=None):
     for node in list(root):
-        clean_root(node)
-        clean_node(node)
+        # bottom-up approach, i.e. starting with children before cleaning current node
+        clean_root(node, keep_threshold)
+        clean_node(node, keep_threshold)
 
 
-def clean_node(node):
+def clean_node(node, keep_threshold=None):
     parent = node.getparent()
 
     if parent is None:
         # this is <html/> (or a removed element waiting for GC)
         return
 
+    if keep_threshold is not None and get_score(node) >= keep_threshold:
+        # high score, so keep
+        return
+
     gdparent = parent.getparent()
 
     # remove shitty tags
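
clean_root() now walks the tree bottom-up and hands the threshold down to clean_node(), which bails out early for nodes scoring at or above it. A standalone sketch of that pattern, with a stand-in scorer (a made-up score attribute, not morss's real scoring) and a clean_node() reduced to a plain removal:

# Bottom-up cleaning with a keep threshold -- a toy version, not the real clean_node().
import lxml.html

def get_score(node):
    # stand-in scorer: read a made-up 'score' attribute
    return float(node.get('score', 0))

def clean_node(node, keep_threshold=None):
    parent = node.getparent()

    if parent is None:
        return

    if keep_threshold is not None and get_score(node) >= keep_threshold:
        # high score, so keep
        return

    parent.remove(node)

def clean_root(root, keep_threshold=None):
    for node in list(root):
        # children first, then the current node
        clean_root(node, keep_threshold)
        clean_node(node, keep_threshold)

html = lxml.html.fromstring(
    '<div><p score="9">article text</p><p score="1">footer junk</p></div>')
clean_root(html, keep_threshold=5)
print(lxml.html.tostring(html))   # only the high-scoring <p> is left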
@@ -313,6 +324,7 @@ def get_article(data, url=None, encoding=None):
     if url:
         best.make_links_absolute(url)
 
-    clean_root(best)
+    keep_threshold = percentile([x[1] for x in scores], 0.1)
+    clean_root(best, keep_threshold)
 
     return lxml.etree.tostring(best, pretty_print=True)
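
In get_article, the threshold is derived from the score list before the best node is cleaned, so nodes in roughly the top 10% of scores are never removed themselves. A small sketch of that wiring, assuming scores is a list of (node, score) pairs sorted best-first (the pairs below are invented), with percentile() repeated from the first hunk so the snippet runs on its own:

# How keep_threshold falls out of the score list (toy values).
def percentile(N, P):
    # https://stackoverflow.com/a/7464107
    n = max(int(round(P * len(N) + 0.5)), 2)
    return N[n-2]

scores = [('node_a', 42.0), ('node_b', 17.5), ('node_c', 9.0),
          ('node_d', 3.0), ('node_e', 1.0)]

keep_threshold = percentile([x[1] for x in scores], 0.1)
print(keep_threshold)   # 42.0 -- with so few candidates only the very best is spared

# in the real code this is then passed on:
# clean_root(best, keep_threshold)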