readabilite: fix iter while iterating
parent
e65c88abf8
commit
4a5150e030
|
@ -104,18 +104,21 @@ def get_best_node(root):
|
||||||
|
|
||||||
|
|
||||||
def clean_html(root):
|
def clean_html(root):
|
||||||
for item in root.iter():
|
for item in list(root.iter()): # list() needed to be able to remove elements while iterating
|
||||||
# Step 1. Do we keep the node?
|
# Step 1. Do we keep the node?
|
||||||
|
|
||||||
if item.tag in tags_junk:
|
if item.tag in tags_junk:
|
||||||
item.getparent().remove(item)
|
item.getparent().remove(item)
|
||||||
|
continue
|
||||||
|
|
||||||
class_id = item.get('class', '') + item.get('id', '')
|
class_id = item.get('class', '') + item.get('id', '')
|
||||||
if regex_bad.match(class_id) is not None:
|
if regex_bad.match(class_id) is not None:
|
||||||
item.getparent().remove(item)
|
item.getparent().remove(item)
|
||||||
|
continue
|
||||||
|
|
||||||
if isinstance(item, lxml.html.HtmlComment):
|
if isinstance(item, lxml.html.HtmlComment):
|
||||||
item.getparent().remove(item)
|
item.getparent().remove(item)
|
||||||
|
continue
|
||||||
|
|
||||||
# Step 2. Clean the node's attributes
|
# Step 2. Clean the node's attributes
|
||||||
|
|
||||||
|
@ -125,7 +128,7 @@ def clean_html(root):
|
||||||
|
|
||||||
|
|
||||||
def br2p(root):
|
def br2p(root):
|
||||||
for item in root.iterfind('.//br'):
|
for item in list(root.iterfind('.//br')):
|
||||||
parent = item.getparent()
|
parent = item.getparent()
|
||||||
if parent is None:
|
if parent is None:
|
||||||
continue
|
continue
|
||||||
|
|
Loading…
Reference in New Issue