readabilite: better parsing
Keeping blank text (i.e. no longer passing remove_blank_text=True to the parser) keeps the tree more as-is, making the final output closer to expectations
This commit is contained in:
		@@ -11,7 +11,7 @@ def parse(data, encoding=None):
 | 
				
			|||||||
    else:
 | 
					    else:
 | 
				
			||||||
        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
 | 
					        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
 | 
					    parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return lxml.html.fromstring(data, parser=parser)
 | 
					    return lxml.html.fromstring(data, parser=parser)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -101,7 +101,7 @@ def score_node(node):
 | 
				
			|||||||
    " Score individual node "
 | 
					    " Score individual node "
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    score = 0
 | 
					    score = 0
 | 
				
			||||||
    class_id = node.get('class', '') + node.get('id', '')
 | 
					    class_id = (node.get('class') or '') + (node.get('id') or '')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (isinstance(node, lxml.html.HtmlComment)
 | 
					    if (isinstance(node, lxml.html.HtmlComment)
 | 
				
			||||||
            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
 | 
					            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user