Compare commits
3 Commits
6a0531ca03
...
f6bc23927f
Author | SHA1 | Date |
---|---|---|
pictuga | f6bc23927f | |
pictuga | c86572374e | |
pictuga | 59ef5af9e2 |
|
@ -401,7 +401,8 @@ class ParserXML(ParserBase):
|
||||||
return
|
return
|
||||||
|
|
||||||
elif key is not None:
|
elif key is not None:
|
||||||
del x.attrib[key]
|
if key in match.attrib:
|
||||||
|
del match.attrib[key]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
match.getparent().remove(match)
|
match.getparent().remove(match)
|
||||||
|
|
|
@ -70,9 +70,10 @@ class_good = ['and', 'article', 'body', 'column', 'main',
|
||||||
regex_good = re.compile('|'.join(class_good), re.I)
|
regex_good = re.compile('|'.join(class_good), re.I)
|
||||||
|
|
||||||
|
|
||||||
tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
|
tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta']
|
||||||
'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
|
|
||||||
'button', 'footer', 'link', 'meta']
|
tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet',
|
||||||
|
'form', 'input', 'textarea', 'button', 'footer']
|
||||||
|
|
||||||
tags_bad = tags_junk + ['a', 'aside']
|
tags_bad = tags_junk + ['a', 'aside']
|
||||||
|
|
||||||
|
@ -106,6 +107,9 @@ def score_node(node):
|
||||||
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
or isinstance(node, lxml.html.HtmlProcessingInstruction)):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
if node.tag in tags_dangerous:
|
||||||
|
return 0
|
||||||
|
|
||||||
if node.tag in tags_junk:
|
if node.tag in tags_junk:
|
||||||
score += -1 # actually -2, as tags_junk is included in tags_bad
|
score += -1 # actually -2, as tags_junk is included in tags_bad
|
||||||
|
|
||||||
|
@ -189,6 +193,11 @@ def clean_node(node, keep_threshold=None):
|
||||||
# this is <html/> (or a removed element waiting for GC)
|
# this is <html/> (or a removed element waiting for GC)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# remove dangerous tags, no matter what
|
||||||
|
if node.tag in tags_dangerous:
|
||||||
|
parent.remove(node)
|
||||||
|
return
|
||||||
|
|
||||||
if keep_threshold is not None and get_score(node) >= keep_threshold:
|
if keep_threshold is not None and get_score(node) >= keep_threshold:
|
||||||
# high score, so keep
|
# high score, so keep
|
||||||
return
|
return
|
||||||
|
@ -307,14 +316,14 @@ def get_best_node(ranked_grades):
|
||||||
return lowest
|
return lowest
|
||||||
|
|
||||||
|
|
||||||
def get_article(data, url=None, encoding=None, debug=False):
|
def get_article(data, url=None, encoding=None, debug=False, threshold=5):
|
||||||
" Input a raw html string, returns a raw html string of the article "
|
" Input a raw html string, returns a raw html string of the article "
|
||||||
|
|
||||||
html = parse(data, encoding)
|
html = parse(data, encoding)
|
||||||
score_all(html)
|
score_all(html)
|
||||||
scores = rank_grades(get_all_scores(html))
|
scores = rank_grades(get_all_scores(html))
|
||||||
|
|
||||||
if not len(scores):
|
if not len(scores) or scores[0][1] < threshold:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
best = get_best_node(scores)
|
best = get_best_node(scores)
|
||||||
|
|
Loading…
Reference in New Issue