readabilite: change scoring
parent
9e71de8d40
commit
a6befad136
|
@ -66,35 +66,30 @@ attributes_fine = ['title', 'src', 'href', 'type', 'name', 'for', 'value']
|
||||||
|
|
||||||
|
|
||||||
def score_node(node):
|
def score_node(node):
|
||||||
|
" Score individual node "
|
||||||
|
|
||||||
score = 0
|
score = 0
|
||||||
|
|
||||||
if isinstance(node, lxml.html.HtmlComment):
|
|
||||||
return 0
|
|
||||||
|
|
||||||
class_id = node.get('class', '') + node.get('id', '')
|
class_id = node.get('class', '') + node.get('id', '')
|
||||||
|
|
||||||
score -= len(regex_bad.findall(class_id))
|
if (isinstance(node, lxml.html.HtmlComment)
|
||||||
score -= len(regex_junk.findall(class_id))
|
or node.tag in tags_bad
|
||||||
score += len(regex_good.findall(class_id))
|
or regex_bad.search(class_id)):
|
||||||
|
return 0
|
||||||
wc = count_words(''.join([node.text or ''] + [x.tail or '' for x in node]))
|
|
||||||
# the .tail part is to include *everything* in that node
|
|
||||||
|
|
||||||
if wc > 10:
|
|
||||||
score += 1
|
|
||||||
|
|
||||||
if wc > 20:
|
|
||||||
score += 1
|
|
||||||
|
|
||||||
if wc > 30:
|
|
||||||
score += 1
|
|
||||||
|
|
||||||
if node.tag in tags_bad or node.tag in tags_junk:
|
|
||||||
score = -1 * abs(score)
|
|
||||||
|
|
||||||
if node.tag in tags_good:
|
if node.tag in tags_good:
|
||||||
|
score += 4
|
||||||
|
|
||||||
|
if regex_good.search(class_id):
|
||||||
score += 3
|
score += 3
|
||||||
|
|
||||||
|
wc = count_words(node.text_content())
|
||||||
|
|
||||||
|
score += min(int(wc/10), 3) # give 1pt bonus for every 10 words, max of 3
|
||||||
|
|
||||||
|
if wc != 0:
|
||||||
|
wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
|
||||||
|
score = score * ( 1 - float(wca)/wc )
|
||||||
|
|
||||||
return score
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue