Test if linked pages are text documents

Useful for feeds such as HackerNews
master
pictuga 2013-09-10 15:25:55 +02:00
parent 1b7fdad6a8
commit 3ba74649f6
1 changed file with 5 additions and 0 deletions

View File

@@ -351,6 +351,11 @@ def Fill(item, cache, feedurl='/', fast=False):
cache.set(item.link, 'error-http')
return True
if con.info().maintype != 'text':
log('non-text page')
cache.set(item.link, 'error-type')
return True
out = readability.Document(data, url=con.url).summary(True)
if countWord(out) > max(count_content, count_desc) > 0: