readabilite: drop dangerous tags (script, style)

readabilite: minimum score requirement
feeds: fix bug when deleting attr in html
2020-04-25 12:25:02 +02:00 · 2020-04-25 12:24:36 +02:00 · 2020-04-24 22:12:05 +02:00
2 changed files with 16 additions and 6 deletions
--- a/morss/feeds.py
+++ b/morss/feeds.py
@@ -401,7 +401,8 @@ class ParserXML(ParserBase):
            return
        elif key is not None:
-            del x.attrib[key]
+            if key in match.attrib:
                del match.attrib[key]
        else:
            match.getparent().remove(match)
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -70,9 +70,10 @@ class_good = ['and', 'article', 'body', 'column', 'main',
 regex_good = re.compile('|'.join(class_good), re.I)
-tags_junk = ['script', 'head', 'iframe', 'object', 'noscript',
+tags_dangerous = ['script', 'head', 'iframe', 'object', 'style', 'link', 'meta']
-    'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
+
-    'button', 'footer', 'link', 'meta']
+tags_junk = tags_dangerous + ['noscript', 'param', 'embed', 'layer', 'applet',
    'form', 'input', 'textarea', 'button', 'footer']
 tags_bad = tags_junk + ['a', 'aside']
@@ -106,6 +107,9 @@ def score_node(node):
            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
        return 0
    if node.tag in tags_dangerous:
        return 0
    if node.tag in tags_junk:
        score += -1 # actually -2, as tags_junk is included in tags_bad
@@ -189,6 +193,11 @@ def clean_node(node, keep_threshold=None):
        # this is <html/> (or a removed element waiting for GC)
        return
    # remove dangerous tags, no matter what
    if node.tag in tags_dangerous:
        parent.remove(node)
        return
    if keep_threshold is not None and get_score(node) >= keep_threshold:
        # high score, so keep
        return
@@ -307,14 +316,14 @@ def get_best_node(ranked_grades):
    return lowest
-def get_article(data, url=None, encoding=None, debug=False):
+def get_article(data, url=None, encoding=None, debug=False, threshold=5):
    " Input a raw html string, returns a raw html string of the article "
    html = parse(data, encoding)
    score_all(html)
    scores = rank_grades(get_all_scores(html))
-    if not len(scores):
+    if not len(scores) or scores[0][1] < threshold:
        return None
    best = get_best_node(scores)
Author	SHA1	Message	Date
pictuga	f6bc23927f	readabilite: drop dangerous tags (script, style)	2020-04-25 12:25:02 +02:00
pictuga	c86572374e	readabilite: minimum score requirement	2020-04-25 12:24:36 +02:00
pictuga	59ef5af9e2	feeds: fix bug when deleting attr in html	2020-04-24 22:12:05 +02:00