From a5aec8c7a6d6eda220f504817994387ff8bfcec7 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 25 Feb 2017 18:13:15 -1000 Subject: [PATCH] readability: more keywords to the filter list Also fixed indentation --- morss/readabilite.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/morss/readabilite.py b/morss/readabilite.py index 506214a..5f8eab5 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -31,16 +31,17 @@ def count_words(string): return count -regex_bad = re.compile('|'.join(['combx', 'comment', 'community', 'disqus', - 'extra', 'foot', 'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar', - 'sponsor', 'ad-break', 'agegate', 'pagination', 'pager', 'popup', 'tweet', - 'twitter', 'com-', 'contact', 'footnote', 'masthead', 'media', 'meta', - 'outbrain', 'promo', 'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', - 'shopping', 'tags', 'tool', 'widget']), re.I) +regex_bad = re.compile('|'.join(['robots-nocontent', 'combx', 'comment', + 'community', 'disqus', 'extra', 'foot', 'header', 'menu', 'remark', 'rss', + 'shoutbox', 'sidebar', 'sponsor', 'ad-break', 'agegate', 'pagination', + 'pager', 'popup', 'tweet', 'twitter', 'com-', 'sharing', 'share', 'social', + 'contact', 'footnote', 'masthead', 'media', 'meta', 'outbrain', 'promo', + 'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags', + 'tool', 'widget']), re.I) regex_good = re.compile('|'.join(['and', 'article', 'body', 'column', - 'main', 'shadow', 'content', 'entry', 'hentry', 'main', 'page', - 'pagination', 'post', 'text', 'blog', 'story', 'par']), re.I) + 'main', 'shadow', 'content', 'entry', 'hentry', 'main', 'page', + 'pagination', 'post', 'text', 'blog', 'story', 'par']), re.I) tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style']