From e136b0feb2d3d151fbbb239cc779853d57a85ffc Mon Sep 17 00:00:00 2001 From: pictuga Date: Sun, 5 Apr 2020 20:47:30 +0200 Subject: [PATCH] readabilite: loosen the slayer Previous impl. led to too many empty results --- morss/readabilite.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/morss/readabilite.py b/morss/readabilite.py index 0768315..1bfdd2a 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -63,7 +63,7 @@ regex_good = re.compile('|'.join(class_good), re.I) tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea', - 'button', 'footer'] + 'button', 'footer', 'link', 'meta'] tags_bad = tags_junk + ['a', 'aside'] @@ -94,11 +94,18 @@ def score_node(node): class_id = node.get('class', '') + node.get('id', '') if (isinstance(node, lxml.html.HtmlComment) - or isinstance(node, lxml.html.HtmlProcessingInstruction) - or node.tag in tags_bad - or regex_bad.search(class_id)): + or isinstance(node, lxml.html.HtmlProcessingInstruction)): return 0 + if node.tag in tags_junk: + score += -1 # actually -2 as tags_junk is included in tags_bad + + if node.tag in tags_bad: + score += -1 + + if regex_bad.search(class_id): + score += -1 + if node.tag in tags_good: score += 4 @@ -126,7 +133,7 @@ def score_all(node, grades=None): score = score_node(child) child.attrib['seen'] = 'yes, ' + str(int(score)) - if score > 0: + if score > 0 or not len(grades): spread_score(child, score, grades) score_all(child, grades)