From 1d6d0b8ff1317797a4c74731ac72fd14e5be9d61 Mon Sep 17 00:00:00 2001 From: pictuga Date: Thu, 25 Oct 2018 01:09:15 +0200 Subject: [PATCH] readabilite: move br2p in the cleaning code --- morss/readabilite.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/morss/readabilite.py b/morss/readabilite.py index b4c43e8..50f64ec 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -201,20 +201,14 @@ def clean_node(node): if attrib not in attributes_fine: del node.attrib[attrib] - -def br2p(root): - for node in list(root.iterfind('.//br')): - parent = node.getparent() - if parent is None: - continue - - gdparent = parent.getparent() + # br2p + if node.tag == 'br': if gdparent is None: - continue + return - if node.tail is None: + if not count_words(node.tail): # if
<br/> is at the end of a div (to avoid having <p/>
) - continue + return else: # set up new node @@ -267,7 +261,6 @@ def get_best_node(grades): def get_article(data, url=None, encoding=None): html = parse(data, encoding) - br2p(html) scores = score_all(html) if not len(scores):