readabilite: drop dangerous tags (script, style)
This commit is contained in:
		@@ -70,9 +70,10 @@ class_good = ['and', 'article', 'body', 'column', 'main',
 | 
				
			|||||||
regex_good = re.compile('|'.join(class_good), re.I)
 | 
					regex_good = re.compile('|'.join(class_good), re.I)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
 | 
					tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta']
 | 
				
			||||||
    'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
 | 
					
 | 
				
			||||||
    'button', 'footer', 'link', 'meta']
 | 
					tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet',
 | 
				
			||||||
 | 
					    'form', 'input', 'textarea', 'button', 'footer']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
tags_bad = tags_junk + ['a', 'aside']
 | 
					tags_bad = tags_junk + ['a', 'aside']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -106,6 +107,9 @@ def score_node(node):
 | 
				
			|||||||
            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
 | 
					            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if node.tag in tags_dangerous:
 | 
				
			||||||
 | 
					        return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if node.tag in tags_junk:
 | 
					    if node.tag in tags_junk:
 | 
				
			||||||
        score += -1 # actuall -2 as tags_junk is included tags_bad
 | 
					        score += -1 # actuall -2 as tags_junk is included tags_bad
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -189,6 +193,11 @@ def clean_node(node, keep_threshold=None):
 | 
				
			|||||||
        # this is <html/> (or a removed element waiting for GC)
 | 
					        # this is <html/> (or a removed element waiting for GC)
 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # remove dangerous tags, no matter what
 | 
				
			||||||
 | 
					    if node.tag in tags_dangerous:
 | 
				
			||||||
 | 
					        parent.remove(node)
 | 
				
			||||||
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if keep_threshold is not None and get_score(node) >= keep_threshold:
 | 
					    if keep_threshold is not None and get_score(node) >= keep_threshold:
 | 
				
			||||||
        # high score, so keep
 | 
					        # high score, so keep
 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user