def count_content(node):
    """Return the amount of content in *node*: words plus images.

    The word part is whatever ``count_words`` returns for the node's
    ``text_content()``; the image part is the number of elements matched
    by ``node.findall('.//img')``.
    """
    # Text is measured first, then images, and the two tallies are summed.
    word_total = count_words(node.text_content())
    img_total = len(node.findall('.//img'))
    return word_total + img_total