readabilite: use custom html parser within bs4's lxml parser
continuous-integration/drone/push Build is passing Details

Solves the following obscure error:
ValueError: Invalid PI name 'b'xml''
master
pictuga 2022-01-03 16:26:17 +00:00
parent d424e394d1
commit 9eb19fac04
1 changed file with 7 additions and 1 deletion

View File

@ -17,14 +17,20 @@
import re import re
import bs4.builder._lxml
import lxml.etree import lxml.etree
import lxml.html import lxml.html
import lxml.html.soupparser import lxml.html.soupparser
class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
    """bs4 lxml tree builder backed by a custom ``lxml.html.HTMLParser``.

    The parser is configured to drop comments and processing instructions
    before they reach the BeautifulSoup tree.
    """

    def default_parser(self, encoding):
        """Return the ``lxml.html.HTMLParser`` instance used to parse markup.

        Args:
            encoding: Encoding forwarded to the lxml parser.

        Returns:
            A parser instance that reports its parse events to this builder.
        """
        return lxml.html.HTMLParser(
            # A parser *instance* is used as-is by bs4 (only a parser class
            # gets instantiated with the builder as target), so the builder
            # must be wired in explicitly or the soup receives no events.
            target=self,
            remove_comments=True,
            # Processing instructions (e.g. "<?xml ...?>") are what trigger
            # "ValueError: Invalid PI name" when the soup is converted back
            # to an lxml tree, so drop them at parse time.
            remove_pis=True,
            encoding=encoding,
        )
def parse(data, encoding=None):
    """Parse ``data`` through ``lxml.html.soupparser`` and return the result.

    BeautifulSoup is asked for its ``lxml`` feature and given
    ``CustomTreeBuilder`` as builder. When ``encoding`` is truthy it is
    forwarded as ``from_encoding``; otherwise no encoding hint is passed.
    """
    soup_options = {'features': 'lxml', 'builder': CustomTreeBuilder}
    if encoding:
        soup_options['from_encoding'] = encoding
    return lxml.html.soupparser.fromstring(data, **soup_options)
def count_words(string): def count_words(string):