readabilite: use custom html parser within bs4's lxml parser
continuous-integration/drone/push Build is passing
Details
continuous-integration/drone/push Build is passing
Details
Solves the following obscure error: ValueError: Invalid PI name 'b'xml''
master
parent
d424e394d1
commit
9eb19fac04
|
@@ -17,14 +17,20 @@
|
|||
|
||||
import re
|
||||
|
||||
import bs4.builder._lxml
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
import lxml.html.soupparser
|
||||
|
||||
|
||||
class CustomTreeBuilder(bs4.builder._lxml.LXMLTreeBuilder):
|
||||
def default_parser(self, encoding):
|
||||
return lxml.html.HTMLParser(remove_comments=True, encoding=encoding)
|
||||
|
||||
|
||||
def parse(data, encoding=None):
|
||||
kwargs = {'from_encoding': encoding} if encoding else {}
|
||||
- return lxml.html.soupparser.fromstring(data, features='lxml', **kwargs)
|
||||
+ return lxml.html.soupparser.fromstring(data, features='lxml', builder=CustomTreeBuilder, **kwargs)
|
||||
|
||||
|
||||
def count_words(string):
|
||||
|
|
Loading…
Reference in New Issue