readabilite: test to replace <br/> with div
parent
c6c113b8a8
commit
e0f533ca31
|
@ -117,5 +117,34 @@ def clean_html(root):
|
|||
del item.attrib[attrib]
|
||||
|
||||
|
||||
def br2p(root):
    """Split elements at each ``<br/>`` into two sibling elements, in place.

    For every ``<br/>`` below *root* that is followed by text, a new element
    with the same tag as the ``<br/>``'s parent is created. It receives the
    text that followed the ``<br/>`` plus all of its following siblings, and
    is inserted immediately after the parent; the ``<br/>`` itself is removed.
    Only the parent's tag is reused; its attributes are not copied.

    A ``<br/>`` is left untouched when it has no parent, when its parent has
    no parent (there is nowhere to insert the new element), or when no text
    follows it.

    The tree is modified in place; nothing is returned.
    """
    # Snapshot the matches first: the loop re-parents nodes, and mutating the
    # tree while a lazy iterator is still walking it is fragile.
    for item in list(root.iterfind('.//br')):
        parent = item.getparent()
        if parent is None:
            continue

        gdparent = parent.getparent()
        if gdparent is None:
            continue

        if item.tail is None:
            # <br/> with no text after it, e.g. at the end of a div: skip it
            # to avoid producing an empty element
            continue

        # set up new item
        new_item = lxml.html.Element(parent.tag)
        new_item.text = item.tail

        # Appending moves each sibling out of `parent`, hence the snapshot.
        for child in list(item.itersiblings()):
            new_item.append(child)

        # delete br (its tail now lives in new_item.text)
        item.tail = None
        parent.remove(item)

        gdparent.insert(gdparent.index(parent) + 1, new_item)

        # Text that followed `parent` must now follow the new element;
        # otherwise it would end up between the two halves and change the
        # reading order of the document.
        new_item.tail = parent.tail
        parent.tail = None
|
||||
|
||||
|
||||
def get_article(data):
    """Return the serialized best node extracted from *data*.

    *data* is handed to ``parse``, the result to ``get_best_node``, and the
    node that comes back is serialized with ``lxml.etree.tostring``.
    """
    tree = parse(data)
    best_node = get_best_node(tree)
    return lxml.etree.tostring(best_node)
|
||||
|
|
Loading…
Reference in New Issue