From 6cf32af6c070f16c53d80c8884dda05039bbe66a Mon Sep 17 00:00:00 2001 From: pictuga Date: Sun, 5 Apr 2020 20:46:42 +0200 Subject: [PATCH] readabilite: also use BS --- morss/readabilite.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/morss/readabilite.py b/morss/readabilite.py index 3412a57..0768315 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -1,5 +1,6 @@ import lxml.etree import lxml.html +from bs4 import BeautifulSoup import re @@ -9,7 +10,7 @@ def parse(data, encoding=None): else: parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True) - return lxml.html.fromstring(data, parser=parser) + return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser) def count_words(string):