readabilite: move br2p in the cleaning code
parent
7d005e9a65
commit
1d6d0b8ff1
|
@ -201,20 +201,14 @@ def clean_node(node):
|
||||||
if attrib not in attributes_fine:
|
if attrib not in attributes_fine:
|
||||||
del node.attrib[attrib]
|
del node.attrib[attrib]
|
||||||
|
|
||||||
|
# br2p
|
||||||
def br2p(root):
|
if node.tag == 'br':
|
||||||
for node in list(root.iterfind('.//br')):
|
|
||||||
parent = node.getparent()
|
|
||||||
if parent is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
gdparent = parent.getparent()
|
|
||||||
if gdparent is None:
|
if gdparent is None:
|
||||||
continue
|
return
|
||||||
|
|
||||||
if node.tail is None:
|
if not count_words(node.tail):
|
||||||
# if <br/> is at the end of a div (to avoid having <p/>)
|
# if <br/> is at the end of a div (to avoid having <p/>)
|
||||||
continue
|
return
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# set up new node
|
# set up new node
|
||||||
|
@ -267,7 +261,6 @@ def get_best_node(grades):
|
||||||
|
|
||||||
def get_article(data, url=None, encoding=None):
|
def get_article(data, url=None, encoding=None):
|
||||||
html = parse(data, encoding)
|
html = parse(data, encoding)
|
||||||
br2p(html)
|
|
||||||
scores = score_all(html)
|
scores = score_all(html)
|
||||||
|
|
||||||
if not len(scores):
|
if not len(scores):
|
||||||
|
|
Loading…
Reference in New Issue