From c6c113b8a830c771c4a34eab62b0cf41be206c13 Mon Sep 17 00:00:00 2001
From: pictuga <contact@pictuga.com>
Date: Sat, 25 Feb 2017 18:15:33 -1000
Subject: [PATCH] readabilite: function to clean up the html code

---
 morss/readabilite.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/morss/readabilite.py b/morss/readabilite.py
index 663ed3b..40a67c2 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -45,6 +45,9 @@ regex_good = re.compile('|'.join(['and', 'article', 'body', 'column',
 
 tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style']
 
+attributes_fine = ['title', 'src', 'href', 'type', 'name', 'for', 'value']  # attributes clean_html() keeps; others are deleted
+
+
 def score_node(node):
     score = 0
 
@@ -93,5 +96,26 @@ def get_best_node(root):
     return sorted(score_all(root).items(), key=lambda x: x[1], reverse=True)[0][0]
 
 
+def clean_html(root):
+    """Drop junk tags, bad class/id nodes and comments; strip other attributes (in place)."""
+    for item in list(root.iter()):  # snapshot: removing nodes under a live iter() is unsafe
+        # Step 1. Do we keep the node? (comments first: they carry no class/id to inspect)
+
+        junk = isinstance(item, lxml.html.HtmlComment) or item.tag in tags_junk
+        if not junk:
+            class_id = (item.get('class') or '') + (item.get('id') or '')
+            junk = bool(regex_bad.match(class_id))
+
+        if junk:
+            parent = item.getparent()
+            if parent is not None:  # the root has no parent and cannot be detached
+                parent.remove(item)
+            continue  # remove at most once, and skip attribute cleanup on dropped nodes
+
+        # Step 2. Clean the node's attributes
+
+        for attrib in [name for name in item.attrib if name not in attributes_fine]:
+            del item.attrib[attrib]
+
 def get_article(data):
     return lxml.etree.tostring(get_best_node(parse(data)))