readabilite: shift "good" tags to a var (list)
So that this list can later be re-used
master
parent
b14381f575
commit
e71fc967ce
|
@ -42,11 +42,12 @@ regex_good = re.compile('|'.join(['and', 'article', 'body', 'column',
|
||||||
'main', 'shadow', 'content', 'entry', 'hentry', 'main', 'page',
|
'main', 'shadow', 'content', 'entry', 'hentry', 'main', 'page',
|
||||||
'pagination', 'post', 'text', 'blog', 'story', 'par']), re.I)
|
'pagination', 'post', 'text', 'blog', 'story', 'par']), re.I)
|
||||||
|
|
||||||
|
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style']
|
||||||
|
|
||||||
def score_node(node):
|
def score_node(node):
|
||||||
score = 0
|
score = 0
|
||||||
|
|
||||||
if node.tag in ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style']:
|
if node.tag in tags_junk:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if isinstance(node, lxml.html.HtmlComment):
|
if isinstance(node, lxml.html.HtmlComment):
|
||||||
|
|
Loading…
Reference in New Issue