readabilite: change cleaning & code structure
Struggled a bit to make the code "nice"
This commit is contained in:
		@@ -98,60 +98,95 @@ def score_node(node):
 | 
				
			|||||||
def score_all(root):
 | 
					def score_all(root):
 | 
				
			||||||
    grades = {}
 | 
					    grades = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for item in root.iter():
 | 
					    for node in list(root.iter()):
 | 
				
			||||||
        score = score_node(item)
 | 
					        score = score_node(node)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        grades[item] = score
 | 
					        parent = node.getparent()
 | 
				
			||||||
 | 
					        clean_node(node)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if parent is not None and node.getparent() is None:
 | 
				
			||||||
 | 
					            # if the node got deleted/dropped (else, nothing to do)
 | 
				
			||||||
 | 
					            # maybe now the parent only contains 1 item and needs to be flattened?
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            gdparent = parent.getparent()
 | 
				
			||||||
 | 
					            clean_node(parent)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            if gdparent is not None and parent.getparent() is None:
 | 
				
			||||||
 | 
					                # if the parent got deleted/dropped
 | 
				
			||||||
 | 
					                spread_score(gdparent, score + grades[parent], grades)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        factor = 2
 | 
					 | 
				
			||||||
        for ancestor in item.iterancestors():
 | 
					 | 
				
			||||||
            if score / factor > 1:
 | 
					 | 
				
			||||||
                grades[ancestor] += score / factor
 | 
					 | 
				
			||||||
                factor *= 2
 | 
					 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                break
 | 
					                # if the parent was kept
 | 
				
			||||||
 | 
					                spread_score(parent, score, grades)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            # if the node was kept
 | 
				
			||||||
 | 
					            spread_score(node, score, grades)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return grades
 | 
					    return grades
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def spread_score(node, score, grades):
 | 
				
			||||||
 | 
					    for ancestor in [node,] + list(node.iterancestors()):
 | 
				
			||||||
 | 
					        if score >= 1 or ancestor is node:
 | 
				
			||||||
 | 
					            try:
 | 
				
			||||||
 | 
					                grades[ancestor] += score
 | 
				
			||||||
 | 
					            except KeyError:
 | 
				
			||||||
 | 
					                grades[ancestor] = score
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            score /= 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def write_score_all(root, grades):
 | 
					def write_score_all(root, grades):
 | 
				
			||||||
    for node in root.iter():
 | 
					    for node in root.iter():
 | 
				
			||||||
        node.attrib['score'] = str(int(grades[node]))
 | 
					        node.attrib['score'] = str(int(grades[node]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def clean_html(root):
 | 
					def clean_node(node):
 | 
				
			||||||
    for item in list(root.iter()): # list() needed to be able to remove elements while iterating
 | 
					 | 
				
			||||||
    # Step 1. Do we keep the node?
 | 
					    # Step 1. Do we keep the node?
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if item.tag in tags_junk:
 | 
					    if node.getparent() is None:
 | 
				
			||||||
 | 
					        # this is <html/>
 | 
				
			||||||
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if node.tag in tags_junk:
 | 
				
			||||||
        # remove shitty tags
 | 
					        # remove shitty tags
 | 
				
			||||||
            item.getparent().remove(item)
 | 
					        node.getparent().remove(node)
 | 
				
			||||||
            continue
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if item.tag in ['div'] \
 | 
					    # Turn <div><p>Bla bla bla</p></div> into <p>Bla bla bla</p>
 | 
				
			||||||
            and len(list(item.iterchildren())) <= 1 \
 | 
					 | 
				
			||||||
            and not (item.text or '').strip() \
 | 
					 | 
				
			||||||
            and not (item.tail or '').strip():
 | 
					 | 
				
			||||||
            # remove div with only one item inside
 | 
					 | 
				
			||||||
            item.drop_tag()
 | 
					 | 
				
			||||||
            continue
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        class_id = item.get('class', '') + item.get('id', '')
 | 
					    if node.tag in ['div'] \
 | 
				
			||||||
        if regex_bad.match(class_id) is not None:
 | 
					        and len(list(node.iterchildren())) <= 1 \
 | 
				
			||||||
 | 
					        and not (node.text or '').strip() \
 | 
				
			||||||
 | 
					        and not (node.tail or '').strip():
 | 
				
			||||||
 | 
					        node.drop_tag()
 | 
				
			||||||
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    class_id = node.get('class', '') + node.get('id', '')
 | 
				
			||||||
 | 
					    if len(regex_junk.findall(class_id)) >= 2:
 | 
				
			||||||
        # remove shitty class/id
 | 
					        # remove shitty class/id
 | 
				
			||||||
            item.getparent().remove(item)
 | 
					        node.getparent().remove(node)
 | 
				
			||||||
            continue
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if isinstance(item, lxml.html.HtmlComment):
 | 
					    if node.tag == 'a' and len(list(node.iter())) > 3:
 | 
				
			||||||
 | 
					        # shitty link
 | 
				
			||||||
 | 
					        node.getparent().remove(node)
 | 
				
			||||||
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if isinstance(node, lxml.html.HtmlComment):
 | 
				
			||||||
        # remove comments
 | 
					        # remove comments
 | 
				
			||||||
            item.getparent().remove(item)
 | 
					        node.getparent().remove(node)
 | 
				
			||||||
            continue
 | 
					        return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Step 2. Clean the node's attributes
 | 
					    # Step 2. Clean the node's attributes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for attrib in item.attrib:
 | 
					    for attrib in node.attrib:
 | 
				
			||||||
        if attrib not in attributes_fine:
 | 
					        if attrib not in attributes_fine:
 | 
				
			||||||
                del item.attrib[attrib]
 | 
					            del node.attrib[attrib]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def br2p(root):
 | 
					def br2p(root):
 | 
				
			||||||
@@ -219,10 +254,7 @@ def get_best_node(grades, highlight=False):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
def get_article(data, url=None, encoding=None):
 | 
					def get_article(data, url=None, encoding=None):
 | 
				
			||||||
    html = parse(data, encoding)
 | 
					    html = parse(data, encoding)
 | 
				
			||||||
 | 
					 | 
				
			||||||
    clean_html(html)
 | 
					 | 
				
			||||||
    br2p(html)
 | 
					    br2p(html)
 | 
				
			||||||
 | 
					 | 
				
			||||||
    scores = score_all(html)
 | 
					    scores = score_all(html)
 | 
				
			||||||
    best = get_best_node(scores)
 | 
					    best = get_best_node(scores)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user