From bfefa8d599a53b0b77a78b223f3cdc1e9344903f Mon Sep 17 00:00:00 2001 From: pictuga Date: Fri, 24 Mar 2017 21:50:26 -1000 Subject: [PATCH] readabilite: add tags to black list --- morss/readabilite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/morss/readabilite.py b/morss/readabilite.py index ef6c8f2..e05f68b 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -40,14 +40,14 @@ regex_bad = re.compile('|'.join(['robots-nocontent', 'combx', 'comment', 'shoutbox', 'sidebar', 'sponsor', 'ad-', 'agegate', 'pagination', 'pager', 'popup', 'tweet', 'twitter', 'com-', 'sharing', 'share', 'social', 'contact', 'footnote', 'masthead', 'media', 'meta', 'outbrain', 'promo', - 'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags', + 'related', 'scroll', 'shoutbox', 'shopping', 'tags', 'tool', 'widget', 'hide']), re.I) regex_good = re.compile('|'.join(['and', 'article', 'body', 'column', 'main', 'shadow', 'content', 'entry', 'hentry', 'main', 'page', 'pagination', 'post', 'text', 'blog', 'story', 'par', 'editorial']), re.I) -tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea'] +tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea', 'button'] attributes_fine = ['title', 'src', 'href', 'type', 'name', 'for', 'value']