readabilite: improve cleaning code

2018-10-25 01:07:25 +02:00
parent f044c242ef
commit 58fe5243af
1 changed files with 41 additions and 21 deletions
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -133,44 +133,64 @@ def write_score_all(root, grades):
 def clean_node(node):
-    # Step 1. Do we keep the node?
+    parent = node.getparent()
-    if node.getparent() is None:
+    if parent is None:
-        # this is <html/>
+        # this is <html/> (or a removed element waiting for GC)
        return
-    if node.tag in tags_junk:
+    gdparent = parent.getparent()
    # remove shitty tags
    if node.tag in tags_junk:
        parent.remove(node)
        return
    # remove shitty class/id FIXME TODO: too aggressive, might want to add a toggle
    class_id = node.get('class', '') + node.get('id', '')
    if len(regex_bad.findall(class_id)) >= 2:
        node.getparent().remove(node)
        return
-    # Turn <div><p>Bla bla bla</p></div> into <p>Bla bla bla</p>
+    # remove shitty link
    if node.tag == 'a' and len(list(node.iter())) > 3:
        parent.remove(node)
        return
-    if node.tag in ['div'] \
+    # remove comments
-        and len(list(node.iterchildren())) <= 1 \
+    if isinstance(node, lxml.html.HtmlComment):
-        and not (node.text or '').strip() \
+        parent.remove(node)
-        and not (node.tail or '').strip():
+        return
    # remove if too many kids & too high link density
    wc = count_words(node.text_content())
    if wc != 0 and len(list(node.iter())) > 3:
        wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
        if float(wca)/wc > 0.8:
            parent.remove(node)
            return
    # squash text-less elements shells
    if node.tag in tags_void:
        # keep 'em
        pass
    elif node.tag in tags_meaning:
        # remove if content-less
        if not count_content(node):
            parent.remove(node)
            return
    else:
        # squash non-meaningful if no direct text
        content = (node.text or '') + ' '.join([child.tail or '' for child in node])
        if not count_words(content):
            node.drop_tag()
            return
-    class_id = node.get('class', '') + node.get('id', '')
+    # for http://vice.com/fr/
-    if len(regex_junk.findall(class_id)) >= 2:
+    if node.tag == 'img' and 'data-src' in node.attrib:
-        # remove shitty class/id
+        node.attrib['src'] = node.attrib['data-src']
        node.getparent().remove(node)
        return
    if node.tag == 'a' and len(list(node.iter())) > 3:
        # shitty link
        node.getparent().remove(node)
        return
    if isinstance(node, lxml.html.HtmlComment):
        # remove comments
        node.getparent().remove(node)
        return
    # Step 2. Clean the node's attributes
    # clean the node's attributes
    for attrib in node.attrib:
        if attrib not in attributes_fine:
            del node.attrib[attrib]