|
|
|
@ -207,8 +207,10 @@ def clean_root(root, keep_threshold=None):
|
|
|
|
|
def clean_node(node, keep_threshold=None):
|
|
|
|
|
parent = node.getparent()
|
|
|
|
|
|
|
|
|
|
# remove comments
|
|
|
|
|
if (isinstance(node, lxml.html.HtmlComment)
|
|
|
|
|
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
|
|
|
|
parent.remove(node)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
if parent is None:
|
|
|
|
@ -242,11 +244,6 @@ def clean_node(node, keep_threshold=None):
|
|
|
|
|
parent.remove(node)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
# remove comments
|
|
|
|
|
if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
|
|
|
|
|
parent.remove(node)
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
# remove if too many kids & too high link density
|
|
|
|
|
wc = count_words(node.text_content())
|
|
|
|
|
if wc != 0 and len(list(node.iter())) > 3:
|
|
|
|
|