From 40c69f17d24c5bb21652a62b32923d833fc30ca2 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sun, 5 Apr 2020 16:12:41 +0200 Subject: [PATCH] feeds: parse HTML with BeautifulSoup (BS) More robust, and makes it consistent with :getpage --- morss/feeds.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/morss/feeds.py b/morss/feeds.py index c6249a3..d92ff43 100644 --- a/morss/feeds.py +++ b/morss/feeds.py @@ -15,6 +15,7 @@ import dateutil.parser from copy import deepcopy import lxml.html +from bs4 import BeautifulSoup json.encoder.c_make_encoder = None @@ -441,7 +442,7 @@ class ParserHTML(ParserXML): def parse(self, raw): parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print - return etree.fromstring(raw, parser) + return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify(), parser) def tostring(self, encoding='unicode', **k): return lxml.html.tostring(self.root, encoding=encoding, **k)