readabilite: test to replace <br/> with div
parent
c6c113b8a8
commit
e0f533ca31
|
@ -117,5 +117,34 @@ def clean_html(root):
|
||||||
del item.attrib[attrib]
|
del item.attrib[attrib]
|
||||||
|
|
||||||
|
|
||||||
|
def br2p(root):
    """Split elements in two at each ``<br/>`` they contain, in place.

    For every ``<br/>`` below ``root`` that has trailing text, a new element
    with the same tag as the ``<br/>``'s parent is created. It receives the
    text that followed the ``<br/>`` and every sibling that came after it,
    and is inserted right after the parent in the grandparent. The ``<br/>``
    itself is removed.

    A ``<br/>`` is left untouched when it has no parent, when its parent has
    no parent (there is nowhere to insert the new element), or when it has
    no trailing text.

    The new element only copies the parent's tag, not its attributes.
    """
    # Snapshot the matches first: the loop below removes and re-parents
    # nodes, and walking a tree while it is being modified is not reliable.
    for item in list(root.iterfind('.//br')):
        parent = item.getparent()
        if parent is None:
            continue

        gdparent = parent.getparent()
        if gdparent is None:
            continue

        if item.tail is None:
            # no text after the <br/> (e.g. <br/> at the end of a div):
            # skip it to avoid producing an empty element
            continue

        # set up new item
        new_item = lxml.html.Element(parent.tag)
        new_item.text = item.tail

        # Appending re-parents each sibling, so iterate over a snapshot.
        for child in list(item.itersiblings()):
            new_item.append(child)

        # The text that followed the original element must now follow its
        # second half, otherwise it ends up between the two halves.
        new_item.tail = parent.tail
        parent.tail = None

        # delete br
        item.tail = None
        parent.remove(item)

        gdparent.insert(gdparent.index(parent) + 1, new_item)
|
||||||
|
|
||||||
|
|
||||||
def get_article(data):
    """Return the serialized best node found in ``data``.

    ``data`` is handed to ``parse``, the parsed result to ``get_best_node``,
    and the selected node is serialized with ``lxml.etree.tostring``.
    """
    tree = parse(data)
    best_node = get_best_node(tree)
    return lxml.etree.tostring(best_node)
|
||||||
|
|
Loading…
Reference in New Issue