readabilite: move br2p into the cleaning code

master
pictuga 2018-10-25 01:09:15 +02:00
parent 7d005e9a65
commit 1d6d0b8ff1
1 changed file with 5 additions and 12 deletions

View File

@@ -201,20 +201,14 @@ def clean_node(node):
if attrib not in attributes_fine: if attrib not in attributes_fine:
del node.attrib[attrib] del node.attrib[attrib]
# br2p
def br2p(root): if node.tag == 'br':
for node in list(root.iterfind('.//br')):
parent = node.getparent()
if parent is None:
continue
gdparent = parent.getparent()
if gdparent is None: if gdparent is None:
continue return
if node.tail is None: if not count_words(node.tail):
# if <br/> is at the end of a div (to avoid having <p/>) # if <br/> is at the end of a div (to avoid having <p/>)
continue return
else: else:
# set up new node # set up new node
@@ -267,7 +261,6 @@ def get_best_node(grades):
def get_article(data, url=None, encoding=None): def get_article(data, url=None, encoding=None):
html = parse(data, encoding) html = parse(data, encoding)
br2p(html)
scores = score_all(html) scores = score_all(html)
if not len(scores): if not len(scores):