feeds: parse html with BS

More robust, and consistent with :getpage
master
pictuga 2020-04-05 16:12:41 +02:00
parent 99461ea185
commit 40c69f17d2
1 changed file with 2 additions and 1 deletion

View File

@ -15,6 +15,7 @@ import dateutil.parser
from copy import deepcopy
import lxml.html
from bs4 import BeautifulSoup
# Clear the C encoder factory so the json module falls back to its
# pure-Python encoder. NOTE(review): the reason is not visible in this
# chunk -- confirm before removing.
json.encoder.c_make_encoder = None
@ -441,7 +442,7 @@ class ParserHTML(ParserXML):
def parse(self, raw):
    """Parse raw HTML markup into an lxml element tree.

    The markup is first run through BeautifulSoup (using its 'lxml'
    backend) and re-serialized with ``prettify()``, and the resulting
    text is then parsed by lxml's HTML parser.

    The previous direct ``etree.fromstring(raw, parser)`` return is
    removed: it sat before the BeautifulSoup-based return and made it
    unreachable.

    Args:
        raw: HTML document to parse, as accepted by ``BeautifulSoup``.

    Returns:
        The element returned by ``etree.fromstring``.
    """
    # remove_blank_text is needed for pretty_print
    parser = etree.HTMLParser(remove_blank_text=True)

    normalized_markup = BeautifulSoup(raw, 'lxml').prettify()
    return etree.fromstring(normalized_markup, parser)
def tostring(self, encoding='unicode', **k):
    """Serialize ``self.root`` with ``lxml.html.tostring``.

    Args:
        encoding: Forwarded as the ``encoding`` keyword; defaults to
            'unicode'.
        **k: Any further keyword arguments, forwarded unchanged.

    Returns:
        Whatever ``lxml.html.tostring`` returns for these options.
    """
    # 'encoding' is a named parameter, so it can never also appear in k;
    # merging the two into one mapping is therefore unambiguous.
    serialize_options = dict(k, encoding=encoding)
    return lxml.html.tostring(self.root, **serialize_options)