Compare commits
4 Commits
d90756b337
...
e136b0feb2
Author | SHA1 | Date |
---|---|---|
pictuga | e136b0feb2 | |
pictuga | 6cf32af6c0 | |
pictuga | 568e7d7dd2 | |
pictuga | 3617f86e9d | |
|
@@ -442,7 +442,7 @@ class ParserHTML(ParserXML):
|
||||||
|
|
||||||
def parse(self, raw):
|
def parse(self, raw):
|
||||||
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
|
parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
|
||||||
return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify(), parser)
|
return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
|
||||||
|
|
||||||
def tostring(self, encoding='unicode', **k):
|
def tostring(self, encoding='unicode', **k):
|
||||||
return lxml.html.tostring(self.root, encoding=encoding, **k)
|
return lxml.html.tostring(self.root, encoding=encoding, **k)
|
||||||
|
|
|
@@ -720,7 +720,7 @@ def cgi_error_handler(environ, start_response, app):
|
||||||
@middleware
|
@middleware
|
||||||
def cgi_encode(environ, start_response, app):
|
def cgi_encode(environ, start_response, app):
|
||||||
out = app(environ, start_response)
|
out = app(environ, start_response)
|
||||||
return [x if isinstance(x, bytes) else x.encode('utf-8') for x in out]
|
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
|
||||||
|
|
||||||
|
|
||||||
def cli_app():
|
def cli_app():
|
||||||
|
|
|
@@ -1,5 +1,6 @@
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
@@ -9,7 +10,7 @@ def parse(data, encoding=None):
|
||||||
else:
|
else:
|
||||||
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
|
parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
|
||||||
|
|
||||||
return lxml.html.fromstring(data, parser=parser)
|
return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
|
||||||
|
|
||||||
|
|
||||||
def count_words(string):
|
def count_words(string):
|
||||||
|
@@ -62,7 +63,7 @@ regex_good = re.compile('|'.join(class_good), re.I)
|
||||||
|
|
||||||
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
|
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
|
||||||
'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
|
'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
|
||||||
'button', 'footer']
|
'button', 'footer', 'link', 'meta']
|
||||||
|
|
||||||
tags_bad = tags_junk + ['a', 'aside']
|
tags_bad = tags_junk + ['a', 'aside']
|
||||||
|
|
||||||
|
@@ -93,11 +94,18 @@ def score_node(node):
|
||||||
class_id = node.get('class', '') + node.get('id', '')
|
class_id = node.get('class', '') + node.get('id', '')
|
||||||
|
|
||||||
if (isinstance(node, lxml.html.HtmlComment)
|
if (isinstance(node, lxml.html.HtmlComment)
|
||||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)
|
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||||
or node.tag in tags_bad
|
|
||||||
or regex_bad.search(class_id)):
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
if node.tag in tags_junk:
|
||||||
|
score += -1 # actuall -2 as tags_junk is included tags_bad
|
||||||
|
|
||||||
|
if node.tag in tags_bad:
|
||||||
|
score += -1
|
||||||
|
|
||||||
|
if regex_bad.search(class_id):
|
||||||
|
score += -1
|
||||||
|
|
||||||
if node.tag in tags_good:
|
if node.tag in tags_good:
|
||||||
score += 4
|
score += 4
|
||||||
|
|
||||||
|
@@ -125,7 +133,7 @@ def score_all(node, grades=None):
|
||||||
score = score_node(child)
|
score = score_node(child)
|
||||||
child.attrib['seen'] = 'yes, ' + str(int(score))
|
child.attrib['seen'] = 'yes, ' + str(int(score))
|
||||||
|
|
||||||
if score > 0:
|
if score > 0 or not len(grades):
|
||||||
spread_score(child, score, grades)
|
spread_score(child, score, grades)
|
||||||
score_all(child, grades)
|
score_all(child, grades)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue