From 83dd2925d32bc31a71796797bc5e25d006373940 Mon Sep 17 00:00:00 2001 From: pictuga Date: Tue, 12 May 2020 14:15:53 +0200 Subject: [PATCH] readabilite: better parsing Keeping blank_text keeps the tree more as-is, making the final output closer to expectations --- morss/readabilite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/morss/readabilite.py b/morss/readabilite.py index a4514b6..26bdc4c 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -11,7 +11,7 @@ def parse(data, encoding=None): else: data = BeautifulSoup(data, 'lxml').prettify('utf-8') - parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8') + parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8') return lxml.html.fromstring(data, parser=parser) @@ -101,7 +101,7 @@ def score_node(node): " Score individual node " score = 0 - class_id = node.get('class', '') + node.get('id', '') + class_id = (node.get('class') or '') + (node.get('id') or '') if (isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction)):