readabilite: put scores directly in the HTML nodes
Probably slower, but makes the code somewhat cleaner...
master
parent
31cac921c7
commit
2f48e18bb1
|
@ -123,33 +123,42 @@ def score_node(node):
|
||||||
return score
|
return score
|
||||||
|
|
||||||
|
|
||||||
def score_all(node):
    """ Fairly dumb loop to score all worthwhile nodes. Tries to be fast.

    Each direct child of `node` is scored with score_node() and tagged with a
    'seen' attribute holding the truncated score. A child is then handed to
    spread_score() and walked recursively when its score is positive, or
    when it has at most two ancestors.

    Nothing is returned: scores are stored on the nodes themselves.
    """

    # All children of `node` share the same depth (node's ancestors + node
    # itself), so count the ancestors once rather than once per child.
    near_root = len(list(node.iterancestors())) + 1 <= 2

    for child in node:
        score = score_node(child)
        child.attrib['seen'] = 'yes, ' + str(int(score))

        if score > 0 or near_root:
            spread_score(child, score)
            # NOTE(review): indentation was lost in the diff rendering; the
            # recursion is kept inside the condition, which matches the
            # "worthwhile nodes" wording of the docstring -- confirm.
            score_all(child)
|
|
||||||
|
|
||||||
|
|
||||||
def spread_score(node, score, grades):
|
def set_score(node, value):
    """ Store `value` as the score of `node`.

    The score is kept in the node's 'morss_score' attribute, serialised as
    the string form of a float (attribute values are written as strings).
    """

    node.attrib['morss_score'] = str(float(value))
|
||||||
|
|
||||||
|
|
||||||
|
def get_score(node):
    """ Return the score of `node` as a float.

    The value is read from the node's 'morss_score' attribute; a node
    without that attribute scores 0.0.
    """

    return float(node.attrib.get('morss_score', 0))
|
||||||
|
|
||||||
|
|
||||||
|
def incr_score(node, delta):
    """ Add `delta` to the current score of `node`.

    The current score is read with get_score() (0.0 when the node has none
    yet) and the sum is written back with set_score().
    """

    set_score(node, get_score(node) + delta)
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_scores(node):
    """ Collect the non-zero scores found under `node`.

    Walks everything yielded by node.iter() and returns a dict mapping each
    node whose score is not 0 to that score.
    """

    scores = {}

    for element in node.iter():
        # read the score once per node and reuse it for both test and value
        score = get_score(element)

        if score != 0:
            scores[element] = score

    return scores
|
||||||
|
|
||||||
|
|
||||||
|
def spread_score(node, score):
|
||||||
" Spread the node's score to its parents, on a linear way "
|
" Spread the node's score to its parents, on a linear way "
|
||||||
|
|
||||||
delta = score / 2
|
delta = score / 2
|
||||||
|
|
||||||
for ancestor in [node,] + list(node.iterancestors()):
|
for ancestor in [node,] + list(node.iterancestors()):
|
||||||
if score >= 1 or ancestor is node:
|
if score >= 1 or ancestor is node:
|
||||||
try:
|
incr_score(ancestor, score)
|
||||||
grades[ancestor] += score
|
|
||||||
except KeyError:
|
|
||||||
grades[ancestor] = score
|
|
||||||
|
|
||||||
score -= delta
|
score -= delta
|
||||||
|
|
||||||
|
@ -157,13 +166,6 @@ def spread_score(node, score, grades):
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def write_score_all(root, grades):
    """ Useful for debugging.

    Writes a 'score' attribute on every node yielded by root.iter(): the
    node's entry in `grades` truncated to an int and stored as a string,
    or '0' for nodes that have no entry.
    """

    for node in root.iter():
        node.attrib['score'] = str(int(grades.get(node, 0)))
|
|
||||||
|
|
||||||
|
|
||||||
def clean_root(root):
|
def clean_root(root):
|
||||||
for node in list(root):
|
for node in list(root):
|
||||||
clean_root(node)
|
clean_root(node)
|
||||||
|
@ -275,18 +277,18 @@ def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
|
||||||
return nodeA # should always find one tho, at least <html/>, but needed for max_depth
|
return nodeA # should always find one tho, at least <html/>, but needed for max_depth
|
||||||
|
|
||||||
|
|
||||||
def rank_grades(grades):
    """ Rank scored nodes, largest score to smallest.

    `grades` maps nodes to scores. Returns a list of (node, score) pairs;
    entries with equal scores keep the order they had in `grades`.
    """

    return sorted(grades.items(), key=lambda pair: pair[1], reverse=True)
|
||||||
|
|
||||||
|
|
||||||
def get_best_node(ranked_grades):
    """ Pick the best (raw) node. Another function will clean it.

    `ranked_grades` is a list of (node, score) pairs ordered from the
    largest score to the smallest.

    Returns None for an empty list, the only node when there is a single
    entry, and otherwise the lowest common ancestor of the two best-scored
    nodes (looked up with a max depth of 3).
    """

    if not ranked_grades:
        return None

    if len(ranked_grades) == 1:
        # Entries are (node, score) pairs: return the node itself, i.e. the
        # same kind of object as the ancestor returned below.
        return ranked_grades[0][0]

    return lowest_common_ancestor(ranked_grades[0][0], ranked_grades[1][0], 3)
|
||||||
|
|
||||||
|
@ -295,7 +297,8 @@ def get_article(data, url=None, encoding=None):
|
||||||
" Input a raw html string, returns a raw html string of the article "
|
" Input a raw html string, returns a raw html string of the article "
|
||||||
|
|
||||||
html = parse(data, encoding)
|
html = parse(data, encoding)
|
||||||
scores = score_all(html)
|
score_all(html)
|
||||||
|
scores = rank_grades(get_all_scores(html))
|
||||||
|
|
||||||
if not len(scores):
|
if not len(scores):
|
||||||
return None
|
return None
|
||||||
|
|
Loading…
Reference in New Issue