readabilite: change scoring

master
pictuga 2018-10-25 00:57:43 +02:00
parent 9e71de8d40
commit a6befad136
1 changed file with 17 additions and 22 deletions

View File

@@ -66,35 +66,30 @@ attributes_fine = ['title', 'src', 'href', 'type', 'name', 'for', 'value']
def score_node(node): def score_node(node):
" Score individual node "
score = 0 score = 0
if isinstance(node, lxml.html.HtmlComment):
return 0
class_id = node.get('class', '') + node.get('id', '') class_id = node.get('class', '') + node.get('id', '')
score -= len(regex_bad.findall(class_id)) if (isinstance(node, lxml.html.HtmlComment)
score -= len(regex_junk.findall(class_id)) or node.tag in tags_bad
score += len(regex_good.findall(class_id)) or regex_bad.search(class_id)):
return 0
wc = count_words(''.join([node.text or ''] + [x.tail or '' for x in node]))
# the .tail part is to include *everything* in that node
if wc > 10:
score += 1
if wc > 20:
score += 1
if wc > 30:
score += 1
if node.tag in tags_bad or node.tag in tags_junk:
score = -1 * abs(score)
if node.tag in tags_good: if node.tag in tags_good:
score += 4
if regex_good.search(class_id):
score += 3 score += 3
wc = count_words(node.text_content())
score += min(int(wc/10), 3) # give 1pt bonus for every 10 words, max of 3
if wc != 0:
wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
score = score * ( 1 - float(wca)/wc )
return score return score