readabilite: better parsing
Keeping blank text (i.e. no longer passing remove_blank_text=True) keeps the tree more as-is, making the final output closer to expectations.
master
parent
e09d0abf54
commit
83dd2925d3
|
@@ -11,7 +11,7 @@ def parse(data, encoding=None):
|
||||||
else:
|
else:
|
||||||
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
|
data = BeautifulSoup(data, 'lxml').prettify('utf-8')
|
||||||
|
|
||||||
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
|
parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
|
||||||
|
|
||||||
return lxml.html.fromstring(data, parser=parser)
|
return lxml.html.fromstring(data, parser=parser)
|
||||||
|
|
||||||
|
@@ -101,7 +101,7 @@ def score_node(node):
|
||||||
" Score individual node "
|
" Score individual node "
|
||||||
|
|
||||||
score = 0
|
score = 0
|
||||||
class_id = node.get('class', '') + node.get('id', '')
|
class_id = (node.get('class') or '') + (node.get('id') or '')
|
||||||
|
|
||||||
if (isinstance(node, lxml.html.HtmlComment)
|
if (isinstance(node, lxml.html.HtmlComment)
|
||||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||||
|
|
Loading…
Reference in New Issue