From 91da0f36dcd625409d638630b67e56fb263b7e82 Mon Sep 17 00:00:00 2001 From: pictuga Date: Fri, 24 Mar 2017 21:50:01 -1000 Subject: [PATCH] readabilite: comment the clean_html function --- morss/readabilite.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/morss/readabilite.py b/morss/readabilite.py index 8c371b9..ef6c8f2 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -112,6 +112,7 @@ def clean_html(root): # Step 1. Do we keep the node? if item.tag in tags_junk: + # remove shitty tags item.getparent().remove(item) continue @@ -119,15 +120,18 @@ def clean_html(root): and len(list(item.iterchildren())) <= 1 \ and not (item.text or '').strip() \ and not (item.tail or '').strip(): + # remove div with only one item inside item.drop_tag() continue class_id = item.get('class', '') + item.get('id', '') if regex_bad.match(class_id) is not None: + # remove shitty class/id item.getparent().remove(item) continue if isinstance(item, lxml.html.HtmlComment): + # remove comments item.getparent().remove(item) continue