feeds: parse HTML with BeautifulSoup (BS)
This is more robust and makes it consistent with :getpage
This commit is contained in:
		@@ -15,6 +15,7 @@ import dateutil.parser
 | 
			
		||||
from copy import deepcopy
 | 
			
		||||
 | 
			
		||||
import lxml.html
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
json.encoder.c_make_encoder = None
 | 
			
		||||
 | 
			
		||||
@@ -441,7 +442,7 @@ class ParserHTML(ParserXML):
 | 
			
		||||
 | 
			
		||||
    def parse(self, raw):
 | 
			
		||||
        parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
 | 
			
		||||
        return etree.fromstring(raw, parser)
 | 
			
		||||
        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify(), parser)
 | 
			
		||||
 | 
			
		||||
    def tostring(self, encoding='unicode', **k):
 | 
			
		||||
        return lxml.html.tostring(self.root, encoding=encoding, **k)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user