readabilite: improve cleaning code

master
pictuga 2018-10-25 01:07:25 +02:00
parent f044c242ef
commit 58fe5243af
1 changed files with 41 additions and 21 deletions

View File

@ -133,44 +133,64 @@ def write_score_all(root, grades):
def clean_node(node):
# Step 1. Do we keep the node?
parent = node.getparent()
if node.getparent() is None:
# this is <html/>
if parent is None:
# this is <html/> (or a removed element waiting for GC)
return
if node.tag in tags_junk:
gdparent = parent.getparent()
# remove shitty tags
if node.tag in tags_junk:
parent.remove(node)
return
# remove shitty class/id FIXME TODO too efficient, might want to add a toggle
class_id = node.get('class', '') + node.get('id', '')
if len(regex_bad.findall(class_id)) >= 2:
node.getparent().remove(node)
return
# Turn <div><p>Bla bla bla</p></div> into <p>Bla bla bla</p>
# remove shitty link
if node.tag == 'a' and len(list(node.iter())) > 3:
parent.remove(node)
return
if node.tag in ['div'] \
and len(list(node.iterchildren())) <= 1 \
and not (node.text or '').strip() \
and not (node.tail or '').strip():
# remove comments
if isinstance(node, lxml.html.HtmlComment):
parent.remove(node)
return
# remove if too many kids & too high link density
wc = count_words(node.text_content())
if wc != 0 and len(list(node.iter())) > 3:
wca = count_words(' '.join([x.text_content() for x in node.findall('.//a')]))
if float(wca)/wc > 0.8:
parent.remove(node)
return
# squash text-less elements shells
if node.tag in tags_void:
# keep 'em
pass
elif node.tag in tags_meaning:
# remove if content-less
if not count_content(node):
parent.remove(node)
return
else:
# squash non-meaningful if no direct text
content = (node.text or '') + ' '.join([child.tail or '' for child in node])
if not count_words(content):
node.drop_tag()
return
class_id = node.get('class', '') + node.get('id', '')
if len(regex_junk.findall(class_id)) >= 2:
# remove shitty class/id
node.getparent().remove(node)
return
if node.tag == 'a' and len(list(node.iter())) > 3:
# shitty link
node.getparent().remove(node)
return
if isinstance(node, lxml.html.HtmlComment):
# remove comments
node.getparent().remove(node)
return
# Step 2. Clean the node's attributes
# for http://vice.com/fr/
if node.tag == 'img' and 'data-src' in node.attrib:
node.attrib['src'] = node.attrib['data-src']
# clean the node's attributes
for attrib in node.attrib:
if attrib not in attributes_fine:
del node.attrib[attrib]