parent
1d6d0b8ff1
commit
72d03f21fe
|
@ -38,6 +38,11 @@ def count_words(string):
|
||||||
return count
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def count_content(node):
    """Return the amount of content in *node*: its word count plus its image count.

    The word count is ``count_words`` applied to ``node.text_content()``;
    the image count is the number of elements matched by
    ``node.findall('.//img')``.

    NOTE(review): *node* presumably is an lxml HTML element (it must provide
    ``text_content()`` and ``findall()``) -- confirm against callers.
    """
    # Evaluate the text first, then the images, in the same order as before.
    word_total = count_words(node.text_content())
    images = node.findall('.//img')
    return word_total + len(images)
|
||||||
|
|
||||||
|
|
||||||
regex_bad = re.compile('|'.join(['comment', 'community', 'extra', 'foot',
|
regex_bad = re.compile('|'.join(['comment', 'community', 'extra', 'foot',
|
||||||
'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
|
'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
|
||||||
'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about']),
|
'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author', 'about']),
|
||||||
|
|
Loading…
Reference in New Issue