From 58fe5243afb517a20235ac12e471e621056b06e8 Mon Sep 17 00:00:00 2001
From: pictuga <contact@pictuga.com>
Date: Thu, 25 Oct 2018 01:07:25 +0200
Subject: [PATCH] readabilite: improve cleaning code

---
 morss/readabilite.py | 62 +++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 21 deletions(-)
diff --git a/morss/readabilite.py b/morss/readabilite.py
index cd2c766..e8a061a 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -133,44 +133,64 @@ def write_score_all(root, grades):
 
 
 def clean_node(node):
-    # Step 1. Do we keep the node?
+    parent = node.getparent()
 
-    if node.getparent() is None:
-        # this is <html/>
+    if parent is None:
+        # this is <html/> (or a removed element waiting for GC)
         return
 
+    gdparent = parent.getparent()
+
+    # remove shitty tags
     if node.tag in tags_junk:
-        # remove shitty tags
-        node.getparent().remove(node)
-        return
-
-    # Turn <div><p>Bla bla bla</p></div> into <p>Bla bla bla</p>
-
-    if node.tag in ['div'] \
-        and len(list(node.iterchildren())) <= 1 \
-        and not (node.text or '').strip() \
-        and not (node.tail or '').strip():
-        node.drop_tag()
+        parent.remove(node)
         return
 
+    # remove shitty class/id FIXME TODO too aggressive, might want to add a toggle
     class_id = node.get('class', '') + node.get('id', '')
-    if len(regex_junk.findall(class_id)) >= 2:
-        # remove shitty class/id
+    if len(regex_bad.findall(class_id)) >= 2:
         node.getparent().remove(node)
         return
 
+    # remove shitty link
     if node.tag == 'a' and len(list(node.iter())) > 3:
-        # shitty link
-        node.getparent().remove(node)
+        parent.remove(node)
         return
 
+    # remove comments
     if isinstance(node, lxml.html.HtmlComment):
-        # remove comments
-        node.getparent().remove(node)
+        parent.remove(node)
         return
 
-    # Step 2. Clean the node's attributes
+    # remove if too many kids & too high link density
+    wc = count_words(node.text_content())
+    if wc != 0 and len(list(node.iter())) > 3:
+        wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
+        if float(wca)/wc > 0.8:
+            parent.remove(node)
+            return
 
+    # squash text-less element shells
+    if node.tag in tags_void:
+        # keep 'em
+        pass
+    elif node.tag in tags_meaning:
+        # remove if content-less
+        if not count_content(node):
+            parent.remove(node)
+            return
+    else:
+        # squash non-meaningful if no direct text
+        content = (node.text or '') + ' '.join([child.tail or '' for child in node])
+        if not count_words(content):
+            node.drop_tag()
+            return
+
+    # use data-src as the image src when present (needed for http://vice.com/fr/)
+    if node.tag == 'img' and 'data-src' in node.attrib:
+        node.attrib['src'] = node.attrib['data-src']
+
+    # clean the node's attributes
     for attrib in node.attrib:
         if attrib not in attributes_fine:
             del node.attrib[attrib]