readabilite: improve cleaning code
This commit is contained in:
		@@ -133,44 +133,64 @@ def write_score_all(root, grades):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def clean_node(node):
 | 
					def clean_node(node):
 | 
				
			||||||
    # Step 1. Do we keep the node?
 | 
					    parent = node.getparent()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if node.getparent() is None:
 | 
					    if parent is None:
 | 
				
			||||||
        # this is <html/>
 | 
					        # this is <html/> (or a removed element waiting for GC)
 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    gdparent = parent.getparent()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # remove shitty tags
 | 
				
			||||||
    if node.tag in tags_junk:
 | 
					    if node.tag in tags_junk:
 | 
				
			||||||
        # remove shitty tags
 | 
					        parent.remove(node)
 | 
				
			||||||
        node.getparent().remove(node)
 | 
					 | 
				
			||||||
        return
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # Turn <div><p>Bla bla bla</p></div> into <p>Bla bla bla</p>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if node.tag in ['div'] \
 | 
					 | 
				
			||||||
        and len(list(node.iterchildren())) <= 1 \
 | 
					 | 
				
			||||||
        and not (node.text or '').strip() \
 | 
					 | 
				
			||||||
        and not (node.tail or '').strip():
 | 
					 | 
				
			||||||
        node.drop_tag()
 | 
					 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # remove shitty class/id FIXME TODO too efficient, might want to add a toggle
 | 
				
			||||||
    class_id = node.get('class', '') + node.get('id', '')
 | 
					    class_id = node.get('class', '') + node.get('id', '')
 | 
				
			||||||
    if len(regex_junk.findall(class_id)) >= 2:
 | 
					    if len(regex_bad.findall(class_id)) >= 2:
 | 
				
			||||||
        # remove shitty class/id
 | 
					 | 
				
			||||||
        node.getparent().remove(node)
 | 
					        node.getparent().remove(node)
 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # remove shitty link
 | 
				
			||||||
    if node.tag == 'a' and len(list(node.iter())) > 3:
 | 
					    if node.tag == 'a' and len(list(node.iter())) > 3:
 | 
				
			||||||
        # shitty link
 | 
					        parent.remove(node)
 | 
				
			||||||
        node.getparent().remove(node)
 | 
					 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # remove comments
 | 
				
			||||||
    if isinstance(node, lxml.html.HtmlComment):
 | 
					    if isinstance(node, lxml.html.HtmlComment):
 | 
				
			||||||
        # remove comments
 | 
					        parent.remove(node)
 | 
				
			||||||
        node.getparent().remove(node)
 | 
					 | 
				
			||||||
        return
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Step 2. Clean the node's attributes
 | 
					    # remove if too many kids & too high link density
 | 
				
			||||||
 | 
					    wc = count_words(node.text_content())
 | 
				
			||||||
 | 
					    if wc != 0 and len(list(node.iter())) > 3:
 | 
				
			||||||
 | 
					        wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
 | 
				
			||||||
 | 
					        if float(wca)/wc > 0.8:
 | 
				
			||||||
 | 
					            parent.remove(node)
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # squash text-less elements shells
 | 
				
			||||||
 | 
					    if node.tag in tags_void:
 | 
				
			||||||
 | 
					        # keep 'em
 | 
				
			||||||
 | 
					        pass
 | 
				
			||||||
 | 
					    elif node.tag in tags_meaning:
 | 
				
			||||||
 | 
					        # remove if content-less
 | 
				
			||||||
 | 
					        if not count_content(node):
 | 
				
			||||||
 | 
					            parent.remove(node)
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        # squash non-meaningful if no direct text
 | 
				
			||||||
 | 
					        content = (node.text or '') + ' '.join([child.tail or '' for child in node])
 | 
				
			||||||
 | 
					        if not count_words(content):
 | 
				
			||||||
 | 
					            node.drop_tag()
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # for http://vice.com/fr/
 | 
				
			||||||
 | 
					    if node.tag == 'img' and 'data-src' in node.attrib:
 | 
				
			||||||
 | 
					        node.attrib['src'] = node.attrib['data-src']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # clean the node's attributes
 | 
				
			||||||
    for attrib in node.attrib:
 | 
					    for attrib in node.attrib:
 | 
				
			||||||
        if attrib not in attributes_fine:
 | 
					        if attrib not in attributes_fine:
 | 
				
			||||||
            del node.attrib[attrib]
 | 
					            del node.attrib[attrib]
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user