From 3f92787b38b12eb9c25b5d784df204d58ba0447a Mon Sep 17 00:00:00 2001
From: pictuga
Date: Sat, 1 Jan 2022 13:58:42 +0100
Subject: [PATCH] readabilite: limit html comments related issues

---
 morss/readabilite.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/morss/readabilite.py b/morss/readabilite.py
index f5a7c43..1789c2a 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -148,15 +148,20 @@ def score_all(node):
 
     for child in node:
         score = score_node(child)
-        child.attrib['morss_own_score'] = str(float(score))
+        set_score(child, score, 'morss_own_score')
 
         if score > 0 or len(list(child.iterancestors())) <= 2:
             spread_score(child, score)
             score_all(child)
 
 
-def set_score(node, value):
-    node.attrib['morss_score'] = str(float(value))
+def set_score(node, value, label='morss_score'):
+    try:
+        node.attrib[label] = str(float(value))
+
+    except KeyError:
+        # catch issues with e.g. html comments
+        pass
 
 
 def get_score(node):
@@ -196,6 +201,10 @@ def clean_root(root, keep_threshold=None):
 def clean_node(node, keep_threshold=None):
     parent = node.getparent()
 
+    if (isinstance(node, lxml.html.HtmlComment)
+        or isinstance(node, lxml.html.HtmlProcessingInstruction)):
+        return
+
     if parent is None:
         # this is (or a removed element waiting for GC)
         return