From 67889a1d1420da9ae4cd5ec0111bb3a1639385ea Mon Sep 17 00:00:00 2001 From: pictuga Date: Fri, 24 Mar 2017 21:49:14 -1000 Subject: [PATCH] readabilite: drop useless tags This extra cluster actually jams the algorithm --- morss/readabilite.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/morss/readabilite.py b/morss/readabilite.py index 5fb58e6..8c371b9 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -115,6 +115,13 @@ def clean_html(root): item.getparent().remove(item) continue + if item.tag in ['div'] \ + and len(list(item.iterchildren())) <= 1 \ + and not (item.text or '').strip() \ + and not (item.tail or '').strip(): + item.drop_tag() + continue + class_id = item.get('class', '') + item.get('id', '') if regex_bad.match(class_id) is not None: item.getparent().remove(item)