parent 79a8ada9f4
commit d6882e0a6a
@@ -55,27 +55,30 @@ attributes_fine = ['title', 'src', 'href', 'type', 'name', 'for', 'value']
 def score_node(node):
     score = 0

-    if node.tag in tags_junk:
-        return 0
-
     if isinstance(node, lxml.html.HtmlComment):
         return 0

-    if node.tag in ['a']:
-        score -= 1
+    wc = count_words(''.join([node.text or ''] + [x.tail or '' for x in node]))
+    # the .tail part is to include *everything* in that node

-    if node.tag in ['h1', 'h2', 'article']:
+    if wc < 5:
+        return 0
+
+    if node.tag in ['h1', 'h2', 'h3', 'article']:
         score += 8

-    if node.tag in ['p']:
+    if node.tag in ['p', 'cite', 'section']:
         score += 3

     class_id = node.get('class', '') + node.get('id', '')

-    score += len(regex_good.findall(class_id) * 4)
+    score += len(regex_good.findall(class_id) * 5)
     score -= len(regex_bad.findall(class_id) * 3)

-    score += count_words(''.join([node.text or ''] + [x.tail or '' for x in node])) / 10. # the .tail part is to include *everything* in that node
+    score += wc / 5.
+
+    if node.tag in tags_junk or node.tag in ['a']:
+        score *= -1

     return score
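Review note: the word count `wc` now drives both the early exit (`wc < 5`) and the length bonus (`wc / 5.`), and the old flat penalties for links and junk tags became a single sign flip at the end. The `.text`/`.tail` join feeding `count_words` is the part that trips people up, so here is a small self-contained illustration (not part of the patch) of what it actually gathers:

import lxml.html

# Standalone illustration of the text gathering used by score_node:
# node.text only holds the text before the first child element, so the
# children's .tail strings must be added to count everything that sits
# directly inside the node (the children's own inner text stays excluded).
node = lxml.html.fromstring('<p>intro <b>bold</b> middle <i>italic</i> end</p>')

direct_text = node.text or ''
with_tails = ''.join([node.text or ''] + [child.tail or '' for child in node])

print(repr(direct_text))  # 'intro '
print(repr(with_tails))   # 'intro  middle  end'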
@@ -99,8 +102,9 @@ def score_all(root):
     return grades


-def get_best_node(root):
-    return sorted(score_all(root).items(), key=lambda x: x[1], reverse=True)[0][0]
+def write_score_all(root, grades):
+    for item in root.iter():
+        item.attrib['score'] = str(int(grades[item]))


 def clean_html(root):
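Review note: `write_score_all` does not feed back into the scoring; it only stamps each element's grade into a `score` attribute so the scored tree can be dumped and inspected. A rough standalone sketch of the effect, with a hand-made `grades` dict standing in for the output of `score_all(root)`:

import lxml.html
import lxml.etree

root = lxml.html.fromstring('<div><p>some text</p><a href="#">a link</a></div>')

# Hand-made stand-in for score_all(root): maps every element to a grade.
grades = {root: 4.2, root[0]: 3.7, root[1]: -1.5}

# Same loop as write_score_all(root, grades)
for item in root.iter():
    item.attrib['score'] = str(int(grades[item]))

print(lxml.etree.tostring(root).decode())
# <div score="4"><p score="3">some text</p><a href="#" score="-1">a link</a></div>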
@@ -156,5 +160,51 @@ def br2p(root):
             gdparent.insert(gdparent.index(parent)+1, new_item)


-def get_article(data, encoding=None):
-    return lxml.etree.tostring(get_best_node(parse(data, encoding)))
+def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
+    ancestorsA = list(nodeA.iterancestors())
+    ancestorsB = list(nodeB.iterancestors())
+
+    if max_depth is not None:
+        ancestorsA = ancestorsA[:max_depth]
+        ancestorsB = ancestorsB[:max_depth]
+
+    ancestorsA.insert(0, nodeA)
+    ancestorsB.insert(0, nodeB)
+
+    for ancestorA in ancestorsA:
+        if ancestorA in ancestorsB:
+            return ancestorA
+
+    return nodeA # should always find one tho, at least <html/>
+
+
+def rank_nodes(grades):
+    return sorted(grades.items(), key=lambda x: x[1], reverse=True)
+
+
+def get_best_node(grades):
+    top = rank_nodes(grades)
+
+    if top[0][1] < top[1][1] * 1.6:
+        # we might still want to include the 2nd best node (great for articles split with images)
+
+        cmn_ancestor = lowest_common_ancestor(top[0][0], top[1][0], 3)
+        return cmn_ancestor
+
+    else:
+        return top[0][0]
+
+
+def get_article(data, url=None, encoding=None):
+    html = parse(data, encoding)
+    br2p(html)
+
+    scores = score_all(html)
+    best = get_best_node(scores)
+
+    clean_html(best)
+
+    if url:
+        best.make_links_absolute(url)
+
+    return lxml.etree.tostring(best)
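Review note: `get_best_node` now falls back to a merge when the runner-up is close (top score below 1.6 times the second-best): it returns the lowest common ancestor of the two nodes, searched at most 3 levels up, so an article split across sibling containers (for example by a figure) is kept in one piece. A self-contained toy version of that ancestor walk, outside the patched module:

import lxml.html

# Two sibling <div>s both score well (think: article text split by a figure),
# so merging them means climbing to the <article> that contains both.
html = lxml.html.fromstring(
    '<article>'
    '<div id="part1"><p>first half of the text</p></div>'
    '<figure><img src="x.jpg"></figure>'
    '<div id="part2"><p>second half of the text</p></div>'
    '</article>')

best = html.get_element_by_id('part1')
second = html.get_element_by_id('part2')

# Same idea as lowest_common_ancestor(best, second, 3): consider each node
# plus at most 3 of its ancestors and return the first one they share.
candidates = [best] + list(best.iterancestors())[:3]
others = [second] + list(second.iterancestors())[:3]
common = next(node for node in candidates if node in others)

print(common.tag)  # article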
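Review note: the new `get_article` pipeline is parse, then `br2p`, `score_all`, `get_best_node`, `clean_html`, with an optional `url=` that rewrites relative links via `make_links_absolute` before serialising. A hypothetical caller might look like this; the module name in the import is an assumption, since the file name is not shown in this diff:

# Hypothetical caller; the import path below is an assumption.
import urllib.request

from readabilite import get_article  # assumed module name

url = 'https://example.com/some-article'

with urllib.request.urlopen(url) as resp:
    raw = resp.read()

# url= triggers best.make_links_absolute(url) on the extracted node;
# the return value is bytes from lxml.etree.tostring().
print(get_article(raw, url=url).decode('utf-8', 'replace'))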