readabilite: avoid double parsing of html
continuous-integration/drone/push Build is passing Details

master
pictuga 2022-01-01 12:51:30 +01:00
parent 87d2fe772d
commit afc31eb6e9
1 changed file with 3 additions and 10 deletions

View File

@@ -19,19 +19,12 @@ import re
 import lxml.etree
 import lxml.html
-from bs4 import BeautifulSoup
+import lxml.html.soupparser
 def parse(data, encoding=None):
-    if encoding:
-        data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
-    else:
-        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
-    parser = lxml.html.HTMLParser(remove_comments=True, encoding='utf-8')
-    return lxml.html.fromstring(data, parser=parser)
+    kwargs = {'from_encoding': encoding} if encoding else {}
+    return lxml.html.soupparser.fromstring(data, **kwargs)
 def count_words(string):