Detect provided content with word count

Word count is now used instead of character count.
master
pictuga 2013-07-14 18:57:12 +02:00
parent 7fa183d713
commit 054f5c0846
1 changed files with 19 additions and 8 deletions

View File

@ -75,6 +75,12 @@ def lenHTML(txt):
else: else:
return 0 return 0
def countWord(txt):
    """Return the number of whitespace-separated words in an HTML fragment.

    The markup is parsed with ``lxml.html`` and only its text content is
    counted, so tags and attributes do not contribute to the total.

    Args:
        txt: HTML source to measure. ``None``, an empty string, and
            whitespace-only input are all treated as containing no words.

    Returns:
        The word count as an ``int``; ``0`` when there is nothing to count.
    """
    # Bail out before touching len()/the parser: len(None) raises TypeError,
    # and a blank document has no words anyway (lxml is expected to refuse
    # an empty document rather than return an empty tree).
    if not txt or not txt.strip():
        return 0
    return len(lxml.html.fromstring(txt).text_content().split())
def parseOptions(available): def parseOptions(available):
options = None options = None
if 'REQUEST_URI' in os.environ: if 'REQUEST_URI' in os.environ:
@ -237,12 +243,17 @@ def Fill(item, cache, feedurl="/", fast=False):
item.title = item.title.title() item.title = item.title.title()
# content already provided? # content already provided?
if item.content and item.desc: count_content = countWord(item.content)
len_content = lenHTML(item.content) count_desc = countWord(item.desc)
len_desc = lenHTML(item.desc)
log('content: %s vs %s' % (len_content, len_desc)) log('desc: %s words, content: %s words' % (count_content, count_desc))
if len_content > 5*len_desc:
log('provided') if max(count_content, count_desc) > 500:
log('long enough')
return True
if count_content > 5*count_desc > 0 and count_content > 50:
log('content bigger enough')
return True return True
# check cache and previous errors # check cache and previous errors
@ -276,7 +287,7 @@ def Fill(item, cache, feedurl="/", fast=False):
data, url = ddl data, url = ddl
out = readability.Document(data, url=url).summary(True) out = readability.Document(data, url=url).summary(True)
if not item.desc or lenHTML(out) > lenHTML(item.desc): if countWord(out) > max(count_content, count_desc) > 0:
item.content = out item.content = out
cache.set(item.link, out) cache.set(item.link, out)
else: else: