readabilite: change scoring algorithm
Use 3 groups of keywords (bad, junk, good) instead of 2 (bad, good)
This commit is contained in:
		@@ -35,19 +35,29 @@ def count_words(string):
 | 
				
			|||||||
    return count
 | 
					    return count
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
regex_bad = re.compile('|'.join(['robots-nocontent', 'combx', 'comment',
 | 
					regex_bad = re.compile('|'.join(['comment', 'community', 'extra', 'foot',
 | 
				
			||||||
    'community', 'disqus', 'extra', 'foot', 'header', 'menu', 'remark', 'rss',
 | 
					    'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
 | 
				
			||||||
    'shoutbox', 'sidebar', 'sponsor', 'ad-', 'agegate', 'pagination',
 | 
					    'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about']),
 | 
				
			||||||
    'pager', 'popup', 'tweet', 'twitter', 'com-', 'sharing', 'share', 'social',
 | 
					    re.I)
 | 
				
			||||||
    'contact', 'footnote', 'masthead', 'media', 'meta', 'outbrain', 'promo',
 | 
					 | 
				
			||||||
    'related', 'scroll', 'shoutbox', 'shopping', 'tags',
 | 
					 | 
				
			||||||
    'tool', 'widget', 'hide', 'author', 'about']), re.I)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
regex_good = re.compile('|'.join(['and', 'article', 'body', 'column',
 | 
					regex_junk = re.compile('|'.join(['robots-nocontent', 'combx', 'disqus',
 | 
				
			||||||
    'main', 'shadow', 'content', 'entry', 'hentry', 'main', 'page',
 | 
					    'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar', 'ad-', 'agegate',
 | 
				
			||||||
    'pagination', 'post', 'text', 'blog', 'story', 'par', 'editorial']), re.I)
 | 
					    'popup', 'sharing', 'share', 'social', 'contact', 'footnote', 'outbrain',
 | 
				
			||||||
 | 
					    'promo', 'scroll', 'hidden', 'widget', 'hide']), re.I)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					regex_good = re.compile('|'.join(['and', 'article', 'body', 'column', 'main',
 | 
				
			||||||
 | 
					    'shadow', 'content', 'entry', 'hentry', 'main', 'page', 'pagination',
 | 
				
			||||||
 | 
					    'post', 'text', 'blog', 'story', 'par', 'editorial']), re.I)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					tags_bad = ['a']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed',
 | 
				
			||||||
 | 
					    'layer', 'applet', 'style', 'form', 'input', 'textarea', 'button', 'footer']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					tags_good = ['h1', 'h2', 'h3', 'article', 'p', 'cite', 'section', 'img',
 | 
				
			||||||
 | 
					    'figcaption', 'figure']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea', 'button']
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
attributes_fine = ['title', 'src', 'href', 'type', 'name', 'for', 'value']
 | 
					attributes_fine = ['title', 'src', 'href', 'type', 'name', 'for', 'value']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -58,28 +68,30 @@ def score_node(node):
 | 
				
			|||||||
    if isinstance(node, lxml.html.HtmlComment):
 | 
					    if isinstance(node, lxml.html.HtmlComment):
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    class_id = node.get('class', '') + node.get('id', '')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    score -= len(regex_bad.findall(class_id))
 | 
				
			||||||
 | 
					    score -= len(regex_junk.findall(class_id))
 | 
				
			||||||
 | 
					    score += len(regex_good.findall(class_id))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    wc = count_words(''.join([node.text or ''] + [x.tail or '' for x in node]))
 | 
					    wc = count_words(''.join([node.text or ''] + [x.tail or '' for x in node]))
 | 
				
			||||||
    # the .tail part is to include *everything* in that node
 | 
					    # the .tail part is to include *everything* in that node
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if wc < 5:
 | 
					    if wc > 10:
 | 
				
			||||||
        return 0
 | 
					        score += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if node.tag in ['h1', 'h2', 'h3', 'article']:
 | 
					    if wc > 20:
 | 
				
			||||||
        score += 8
 | 
					        score += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if node.tag in ['p', 'cite', 'section']:
 | 
					    if wc > 30:
 | 
				
			||||||
 | 
					        score += 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if node.tag in tags_bad or node.tag in tags_junk:
 | 
				
			||||||
 | 
					        score = -1 * abs(score)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if node.tag in tags_good:
 | 
				
			||||||
        score += 3
 | 
					        score += 3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    class_id = node.get('class', '') + node.get('id', '')
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    score += len(regex_good.findall(class_id) * 5)
 | 
					 | 
				
			||||||
    score -= len(regex_bad.findall(class_id) * 3)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    score += wc / 5.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if node.tag in tags_junk or node.tag in ['a']:
 | 
					 | 
				
			||||||
        score *= -1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return score
 | 
					    return score
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user