readabilite: don't clean out the top 10% of nodes

Loosen up the code once again to limit overkill
2020-04-06 14:26:28 +02:00
parent 2f48e18bb1
commit dc9e425247
1 changed files with 17 additions and 5 deletions
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -44,6 +44,12 @@ def count_content(node):
    return count_words(node.text_content()) + len(node.findall('.//img'))


+def percentile(N, P):
+    # N must already be sorted and non-empty (it is indexed directly); see https://stackoverflow.com/a/7464107
+    n = max(int(round(P * len(N) + 0.5)), 2)
+    return N[n-2]
+
+
 class_bad = ['comment', 'community', 'extra', 'foot',
    'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
    'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about',
@@ -166,19 +172,24 @@ def spread_score(node, score):
            break


-def clean_root(root):
+def clean_root(root, keep_threshold=None):
    for node in list(root):
-        clean_root(node)
-        clean_node(node)
+        # bottom-up approach, i.e. starting with children before cleaning current node
+        clean_root(node, keep_threshold)
+        clean_node(node, keep_threshold)


-def clean_node(node):
+def clean_node(node, keep_threshold=None):
    parent = node.getparent()

    if parent is None:
        # this is <html/> (or a removed element waiting for GC)
        return

+    if keep_threshold is not None and get_score(node) <= keep_threshold:
+        # meant to keep high-scoring nodes; NOTE(review): `<=` keeps scores at/below the threshold -- confirm vs `>=`
+        return
+
    gdparent = parent.getparent()

    # remove shitty tags
@@ -313,6 +324,7 @@ def get_article(data, url=None, encoding=None):
    if url:
        best.make_links_absolute(url)

-    clean_root(best)
+    keep_threshold = percentile([x[1] for x in scores], 0.1)
+    clean_root(best, keep_threshold)

    return lxml.etree.tostring(best, pretty_print=True)