readabilite: change cleaning & code structure

Kinda struggled to make some "nice" code
master
pictuga 2017-07-17 00:27:41 +02:00
parent 386bafd391
commit 3bfad54add
1 changed file with 71 additions and 39 deletions

View File

@ -98,60 +98,95 @@ def score_node(node):
def score_all(root): def score_all(root):
grades = {} grades = {}
for item in root.iter(): for node in list(root.iter()):
score = score_node(item) score = score_node(node)
grades[item] = score parent = node.getparent()
clean_node(node)
if parent is not None and node.getparent() is None:
# if the node got deleted/dropped (else, nothing to do)
# maybe now the parent only contains 1 item and needs to be flattened?
gdparent = parent.getparent()
clean_node(parent)
if gdparent is not None and parent.getparent() is None:
# if the parent got deleted/dropped
spread_score(gdparent, score + grades[parent], grades)
factor = 2
for ancestor in item.iterancestors():
if score / factor > 1:
grades[ancestor] += score / factor
factor *= 2
else: else:
break # if the parent was kept
spread_score(parent, score, grades)
else:
# if the node was kept
spread_score(node, score, grades)
return grades return grades
def spread_score(node, score, grades):
    """Credit ``score`` to ``node`` and a decaying share to its ancestors.

    ``node`` itself always receives the full ``score``. Each ancestor
    returned by ``node.iterancestors()`` then receives half of what the
    previous element received. Propagation stops at the first ancestor
    whose share is not at least 1. ``grades`` is updated in place:
    existing entries are incremented, missing ones are created.
    """
    lineage = [node]
    lineage.extend(node.iterancestors())

    share = score
    for element in lineage:
        # The starting node is credited unconditionally; ancestors only
        # while the share being passed up is still at least 1.
        credited = element is node or share >= 1
        if not credited:
            break

        grades[element] = grades.get(element, 0) + share
        share /= 2
def write_score_all(root, grades): def write_score_all(root, grades):
for node in root.iter(): for node in root.iter():
node.attrib['score'] = str(int(grades[node])) node.attrib['score'] = str(int(grades[node]))
def clean_html(root): def clean_node(node):
for item in list(root.iter()): # list() needed to be able to remove elements while iterating
# Step 1. Do we keep the node? # Step 1. Do we keep the node?
if item.tag in tags_junk: if node.getparent() is None:
# this is <html/>
return
if node.tag in tags_junk:
# remove shitty tags # remove shitty tags
item.getparent().remove(item) node.getparent().remove(node)
continue return
if item.tag in ['div'] \ # Turn <div><p>Bla bla bla</p></div> into <p>Bla bla bla</p>
and len(list(item.iterchildren())) <= 1 \
and not (item.text or '').strip() \
and not (item.tail or '').strip():
# remove div with only one item inside
item.drop_tag()
continue
class_id = item.get('class', '') + item.get('id', '') if node.tag in ['div'] \
if regex_bad.match(class_id) is not None: and len(list(node.iterchildren())) <= 1 \
and not (node.text or '').strip() \
and not (node.tail or '').strip():
node.drop_tag()
return
class_id = node.get('class', '') + node.get('id', '')
if len(regex_junk.findall(class_id)) >= 2:
# remove shitty class/id # remove shitty class/id
item.getparent().remove(item) node.getparent().remove(node)
continue return
if isinstance(item, lxml.html.HtmlComment): if node.tag == 'a' and len(list(node.iter())) > 3:
# shitty link
node.getparent().remove(node)
return
if isinstance(node, lxml.html.HtmlComment):
# remove comments # remove comments
item.getparent().remove(item) node.getparent().remove(node)
continue return
# Step 2. Clean the node's attributes # Step 2. Clean the node's attributes
for attrib in item.attrib: for attrib in node.attrib:
if attrib not in attributes_fine: if attrib not in attributes_fine:
del item.attrib[attrib] del node.attrib[attrib]
def br2p(root): def br2p(root):
@ -219,10 +254,7 @@ def get_best_node(grades, highlight=False):
def get_article(data, url=None, encoding=None): def get_article(data, url=None, encoding=None):
html = parse(data, encoding) html = parse(data, encoding)
clean_html(html)
br2p(html) br2p(html)
scores = score_all(html) scores = score_all(html)
best = get_best_node(scores) best = get_best_node(scores)