readabilite: change cleaning & code structure
Kinda struggled to make some "nice" code
master
parent
386bafd391
commit
3bfad54add
|
@ -98,60 +98,95 @@ def score_node(node):
|
||||||
def score_all(root):
|
def score_all(root):
|
||||||
grades = {}
|
grades = {}
|
||||||
|
|
||||||
for item in root.iter():
|
for node in list(root.iter()):
|
||||||
score = score_node(item)
|
score = score_node(node)
|
||||||
|
|
||||||
grades[item] = score
|
parent = node.getparent()
|
||||||
|
clean_node(node)
|
||||||
|
|
||||||
|
if parent is not None and node.getparent() is None:
|
||||||
|
# if the node got deleted/dropped (else, nothing to do)
|
||||||
|
# maybe now the parent only contains 1 item and needs to be flattened?
|
||||||
|
|
||||||
|
gdparent = parent.getparent()
|
||||||
|
clean_node(parent)
|
||||||
|
|
||||||
|
if gdparent is not None and parent.getparent() is None:
|
||||||
|
# if the parent got deleted/dropped
|
||||||
|
spread_score(gdparent, score + grades[parent], grades)
|
||||||
|
|
||||||
factor = 2
|
|
||||||
for ancestor in item.iterancestors():
|
|
||||||
if score / factor > 1:
|
|
||||||
grades[ancestor] += score / factor
|
|
||||||
factor *= 2
|
|
||||||
else:
|
else:
|
||||||
break
|
# if the parent was kept
|
||||||
|
spread_score(parent, score, grades)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# if the node was kept
|
||||||
|
spread_score(node, score, grades)
|
||||||
|
|
||||||
return grades
|
return grades
|
||||||
|
|
||||||
|
|
||||||
|
def spread_score(node, score, grades):
|
||||||
|
for ancestor in [node,] + list(node.iterancestors()):
|
||||||
|
if score >= 1 or ancestor is node:
|
||||||
|
try:
|
||||||
|
grades[ancestor] += score
|
||||||
|
except KeyError:
|
||||||
|
grades[ancestor] = score
|
||||||
|
|
||||||
|
score /= 2
|
||||||
|
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
def write_score_all(root, grades):
|
def write_score_all(root, grades):
|
||||||
for node in root.iter():
|
for node in root.iter():
|
||||||
node.attrib['score'] = str(int(grades[node]))
|
node.attrib['score'] = str(int(grades[node]))
|
||||||
|
|
||||||
|
|
||||||
def clean_html(root):
|
def clean_node(node):
|
||||||
for item in list(root.iter()): # list() needed to be able to remove elements while iterating
|
# Step 1. Do we keep the node?
|
||||||
# Step 1. Do we keep the node?
|
|
||||||
|
|
||||||
if item.tag in tags_junk:
|
if node.getparent() is None:
|
||||||
# remove shitty tags
|
# this is <html/>
|
||||||
item.getparent().remove(item)
|
return
|
||||||
continue
|
|
||||||
|
|
||||||
if item.tag in ['div'] \
|
if node.tag in tags_junk:
|
||||||
and len(list(item.iterchildren())) <= 1 \
|
# remove shitty tags
|
||||||
and not (item.text or '').strip() \
|
node.getparent().remove(node)
|
||||||
and not (item.tail or '').strip():
|
return
|
||||||
# remove div with only one item inside
|
|
||||||
item.drop_tag()
|
|
||||||
continue
|
|
||||||
|
|
||||||
class_id = item.get('class', '') + item.get('id', '')
|
# Turn <div><p>Bla bla bla</p></div> into <p>Bla bla bla</p>
|
||||||
if regex_bad.match(class_id) is not None:
|
|
||||||
# remove shitty class/id
|
|
||||||
item.getparent().remove(item)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if isinstance(item, lxml.html.HtmlComment):
|
if node.tag in ['div'] \
|
||||||
# remove comments
|
and len(list(node.iterchildren())) <= 1 \
|
||||||
item.getparent().remove(item)
|
and not (node.text or '').strip() \
|
||||||
continue
|
and not (node.tail or '').strip():
|
||||||
|
node.drop_tag()
|
||||||
|
return
|
||||||
|
|
||||||
# Step 2. Clean the node's attributes
|
class_id = node.get('class', '') + node.get('id', '')
|
||||||
|
if len(regex_junk.findall(class_id)) >= 2:
|
||||||
|
# remove shitty class/id
|
||||||
|
node.getparent().remove(node)
|
||||||
|
return
|
||||||
|
|
||||||
for attrib in item.attrib:
|
if node.tag == 'a' and len(list(node.iter())) > 3:
|
||||||
if attrib not in attributes_fine:
|
# shitty link
|
||||||
del item.attrib[attrib]
|
node.getparent().remove(node)
|
||||||
|
return
|
||||||
|
|
||||||
|
if isinstance(node, lxml.html.HtmlComment):
|
||||||
|
# remove comments
|
||||||
|
node.getparent().remove(node)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Step 2. Clean the node's attributes
|
||||||
|
|
||||||
|
for attrib in node.attrib:
|
||||||
|
if attrib not in attributes_fine:
|
||||||
|
del node.attrib[attrib]
|
||||||
|
|
||||||
|
|
||||||
def br2p(root):
|
def br2p(root):
|
||||||
|
@ -219,10 +254,7 @@ def get_best_node(grades, highlight=False):
|
||||||
|
|
||||||
def get_article(data, url=None, encoding=None):
|
def get_article(data, url=None, encoding=None):
|
||||||
html = parse(data, encoding)
|
html = parse(data, encoding)
|
||||||
|
|
||||||
clean_html(html)
|
|
||||||
br2p(html)
|
br2p(html)
|
||||||
|
|
||||||
scores = score_all(html)
|
scores = score_all(html)
|
||||||
best = get_best_node(scores)
|
best = get_best_node(scores)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue