From 2f48e18bb1e223b78fdb927fba89a6d486ba72e7 Mon Sep 17 00:00:00 2001 From: pictuga Date: Mon, 6 Apr 2020 14:21:41 +0200 Subject: [PATCH] readabilite: put scores directly in html node Probably slower but makes code somewhat cleaner... --- morss/readabilite.py | 59 +++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/morss/readabilite.py b/morss/readabilite.py index 1bfdd2a..b5613a0 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -123,33 +123,42 @@ def score_node(node): return score -def score_all(node, grades=None): +def score_all(node): " Fairly dumb loop to score all worthwhile nodes. Tries to be fast " - if grades is None: - grades = {} - for child in node: score = score_node(child) child.attrib['seen'] = 'yes, ' + str(int(score)) - if score > 0 or not len(grades): - spread_score(child, score, grades) - score_all(child, grades) - - return grades + if score > 0 or len(list(child.iterancestors())) <= 2: + spread_score(child, score) + score_all(child) -def spread_score(node, score, grades): +def set_score(node, value): + node.attrib['morss_score'] = str(float(value)) + + +def get_score(node): + return float(node.attrib.get('morss_score', 0)) + + +def incr_score(node, delta): + set_score(node, get_score(node) + delta) + + +def get_all_scores(node): + return {x:get_score(x) for x in list(node.iter()) if get_score(x) != 0} + + +def spread_score(node, score): " Spread the node's score to its parents, on a linear way " delta = score / 2 + for ancestor in [node,] + list(node.iterancestors()): if score >= 1 or ancestor is node: - try: - grades[ancestor] += score - except KeyError: - grades[ancestor] = score + incr_score(ancestor, score) score -= delta @@ -157,13 +166,6 @@ def spread_score(node, score, grades): break -def write_score_all(root, grades): - " Useful for debugging " - - for node in root.iter(): - node.attrib['score'] = str(int(grades.get(node, 0))) - - def clean_root(root): for node in list(root): clean_root(node) @@ -275,18 +277,18 @@ def lowest_common_ancestor(nodeA, nodeB, max_depth=None): return nodeA # should always find one tho, at least , but needed for max_depth -def rank_nodes(grades): +def rank_grades(grades): + # largest score to smallest return sorted(grades.items(), key=lambda x: x[1], reverse=True) -def get_best_node(grades): +def get_best_node(ranked_grades): " To pick the best (raw) node. Another function will clean it " - if len(grades) == 1: - return grades[0] + if len(ranked_grades) == 1: + return ranked_grades[0] - top = rank_nodes(grades) - lowest = lowest_common_ancestor(top[0][0], top[1][0], 3) + lowest = lowest_common_ancestor(ranked_grades[0][0], ranked_grades[1][0], 3) return lowest @@ -295,7 +297,8 @@ def get_article(data, url=None, encoding=None): " Input a raw html string, returns a raw html string of the article " html = parse(data, encoding) - scores = score_all(html) + score_all(html) + scores = rank_grades(get_all_scores(html)) if not len(scores): return None