readabilite: test to replace <br/> with div
This commit is contained in:
		@@ -117,5 +117,34 @@ def clean_html(root):
 | 
				
			|||||||
                del item.attrib[attrib]
 | 
					                del item.attrib[attrib]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def br2p(root):
					    """Split elements at each ``<br/>`` into consecutive sibling elements.

					    For every ``<br>`` found below *root* that carries tail text, a new
					    element with the same tag as the ``<br>``'s parent is created. It
					    receives the ``<br>``'s tail text and every sibling that followed the
					    ``<br>``, and is inserted right after the parent. The ``<br>`` itself
					    is removed. The tree is modified in place; nothing is returned.

					    A ``<br>`` is left untouched when it has no parent, when its parent
					    has no parent, or when its tail is ``None``.
					    """
					    # Snapshot the matches first: the loop body moves and removes nodes,
					    # and mutating a tree while a lazy iterator walks it is unsafe.
					    for item in root.findall('.//br'):
					        parent = item.getparent()
					        if parent is None:
					            continue

					        gdparent = parent.getparent()
					        if gdparent is None:
					            continue

					        if item.tail is None:
					            # if <br/> is at the end of a div (to avoid having <p/>)
					            continue

					        # set up new item
					        new_item = lxml.html.Element(parent.tag)
					        new_item.text = item.tail

					        # Snapshot the siblings too: append() moves each child out of
					        # `parent`, which changes the sibling chain being iterated.
					        for child in list(item.itersiblings()):
					            new_item.append(child)

					        # The text that followed `parent` must now follow the new element,
					        # otherwise it would end up between the two halves and the
					        # document's text order would change.
					        new_item.tail = parent.tail
					        parent.tail = None

					        # delete br (tail cleared first so its text is not kept twice)
					        item.tail = None
					        parent.remove(item)

					        gdparent.insert(gdparent.index(parent) + 1, new_item)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_article(data):
    """Return the serialized best node extracted from *data*.

    *data* is handed to ``parse``, the resulting tree is passed to
    ``get_best_node``, and the node it returns is serialized with
    ``lxml.etree.tostring``.
    """
    tree = parse(data)
    best_node = get_best_node(tree)
    return lxml.etree.tostring(best_node)
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user