From 9eb19fac04037443d8c4de79ccb89c2b34ab9304 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Mon, 3 Jan 2022 16:26:17 +0000
Subject: [PATCH] readabilite: use custom html parser within bs4's lxml parser

Solves the following obscure error:
ValueError: Invalid PI name 'b'xml''
---
 morss/readabilite.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/morss/readabilite.py b/morss/readabilite.py
index 608dae4..a7ac92f 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -17,14 +17,20 @@
 
 import re
 
+import bs4.builder._lxml
 import lxml.etree
 import lxml.html
 import lxml.html.soupparser
 
 
+class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
+    def default_parser(self, encoding):
+        return lxml.html.HTMLParser(remove_comments=True, encoding=encoding)
+
+
 def parse(data, encoding=None):
     kwargs = {'from_encoding': encoding} if encoding else {}
-    return lxml.html.soupparser.fromstring(data, features='lxml', **kwargs)
+    return lxml.html.soupparser.fromstring(data, features='lxml', builder=CustomTreeBuilder, **kwargs)
 
 
 def count_words(string):