Compare commits

..

No commits in common. "f6bc23927fbf6f9b5e39ca78403ce50af87cc7d9" and "6a0531ca033d5936c023dc27b27d74fb5392cde6" have entirely different histories.

2 changed files with 6 additions and 16 deletions

View File

@@ -401,8 +401,7 @@ class ParserXML(ParserBase):
return return
elif key is not None: elif key is not None:
if key in match.attrib: del x.attrib[key]
del match.attrib[key]
else: else:
match.getparent().remove(match) match.getparent().remove(match)

View File

@@ -70,10 +70,9 @@ class_good = ['and', 'article', 'body', 'column', 'main',
regex_good = re.compile('|'.join(class_good), re.I) regex_good = re.compile('|'.join(class_good), re.I)
tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta'] tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet', 'button', 'footer', 'link', 'meta']
'form', 'input', 'textarea', 'button', 'footer']
tags_bad = tags_junk + ['a', 'aside'] tags_bad = tags_junk + ['a', 'aside']
@@ -107,9 +106,6 @@ def score_node(node):
or isinstance(node, lxml.html.HtmlProcessingInstruction)): or isinstance(node, lxml.html.HtmlProcessingInstruction)):
return 0 return 0
if node.tag in tags_dangerous:
return 0
if node.tag in tags_junk: if node.tag in tags_junk:
score += -1 # actually -2 as tags_junk is included in tags_bad score += -1 # actually -2 as tags_junk is included in tags_bad
@@ -193,11 +189,6 @@ def clean_node(node, keep_threshold=None):
# this is <html/> (or a removed element waiting for GC) # this is <html/> (or a removed element waiting for GC)
return return
# remove dangerous tags, no matter what
if node.tag in tags_dangerous:
parent.remove(node)
return
if keep_threshold is not None and get_score(node) >= keep_threshold: if keep_threshold is not None and get_score(node) >= keep_threshold:
# high score, so keep # high score, so keep
return return
@@ -316,14 +307,14 @@ def get_best_node(ranked_grades):
return lowest return lowest
def get_article(data, url=None, encoding=None, debug=False, threshold=5): def get_article(data, url=None, encoding=None, debug=False):
" Input a raw html string, returns a raw html string of the article " " Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding) html = parse(data, encoding)
score_all(html) score_all(html)
scores = rank_grades(get_all_scores(html)) scores = rank_grades(get_all_scores(html))
if not len(scores) or scores[0][1] < threshold: if not len(scores):
return None return None
best = get_best_node(scores) best = get_best_node(scores)