readabilite: better parsing

Keeping blank text (no longer passing remove_blank_text=True to the HTML parser) keeps the tree closer to as-is, making the final output closer to expectations.
master
pictuga 2020-05-12 14:15:53 +02:00
parent e09d0abf54
commit 83dd2925d3
1 changed files with 2 additions and 2 deletions

View File

@ -11,7 +11,7 @@ def parse(data, encoding=None):
else: else:
data = BeautifulSoup(data, 'lxml').prettify('utf-8') data = BeautifulSoup(data, 'lxml').prettify('utf-8')
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8') parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
return lxml.html.fromstring(data, parser=parser) return lxml.html.fromstring(data, parser=parser)
@ -101,7 +101,7 @@ def score_node(node):
" Score individual node " " Score individual node "
score = 0 score = 0
class_id = node.get('class', '') + node.get('id', '') class_id = (node.get('class') or '') + (node.get('id') or '')
if (isinstance(node, lxml.html.HtmlComment) if (isinstance(node, lxml.html.HtmlComment)
or isinstance(node, lxml.html.HtmlProcessingInstruction)): or isinstance(node, lxml.html.HtmlProcessingInstruction)):