Check that the article is long enough

Avoids replacing a rather useful description with an empty string
master
pictuga 2013-05-15 17:24:27 +02:00
parent 222b1369e5
commit 85e40cde4e
1 changed files with 15 additions and 6 deletions

View File

@ -67,6 +67,9 @@ def cleanXML(xml):
table = string.maketrans('', '') table = string.maketrans('', '')
return xml.translate(table, table[:32]).lstrip() return xml.translate(table, table[:32]).lstrip()
def lenHTML(txt):
    """Return the length of the text content of an HTML fragment.

    The markup in `txt` is parsed with lxml and only the resulting text
    content is measured, so tags and attributes do not count.

    Empty, whitespace-only or None input yields 0 rather than being handed
    to the parser: lxml refuses to parse an empty document and raises,
    which would abort the caller's length comparison.
    """
    # Guard first: there is nothing to measure, and lxml would raise here.
    if not txt or not txt.strip():
        return 0
    return len(lxml.html.fromstring(txt).text_content())
def parseOptions(available): def parseOptions(available):
options = None options = None
if 'REQUEST_URI' in os.environ: if 'REQUEST_URI' in os.environ:
@ -347,12 +350,13 @@ def Fill(rss, cache, mode='feed'):
# check cache and previous errors # check cache and previous errors
if item.link in cache: if item.link in cache:
content = cache.get(item.link) content = cache.get(item.link)
if content == 'httperr': match = re.search(r'^error-([a-z]{2,10})$', content)
if match:
if cache.isYoungerThan(DELAY*60): if cache.isYoungerThan(DELAY*60):
log('cached http err') log('cached error: %s' % match.groups()[0])
return return
else: else:
log('old http error') log('old error')
else: else:
log('cached') log('cached')
item.content = cache.get(item.link) item.content = cache.get(item.link)
@ -367,16 +371,21 @@ def Fill(rss, cache, mode='feed'):
if ddl is False: if ddl is False:
log('http error') log('http error')
cache.set(item.link, 'httperr') cache.set(item.link, 'error-http')
return return
data, enc, url = ddl data, enc, url = ddl
log(enc) log(enc)
out = readability.Document(data.decode(enc, 'ignore'), url=url).summary(True) out = readability.Document(data.decode(enc, 'ignore'), url=url).summary(True)
if 'desc' not in item or lenHTML(out) > lenHTML(item.desc):
item.content = out
cache.set(item.link, out)
else:
log('not bigger enough')
cache.set(item.link, 'error-length')
return
item.content = out
cache.set(item.link, out)
def Gather(url, cachePath, mode='feed'): def Gather(url, cachePath, mode='feed'):
cache = Cache(cachePath, url) cache = Cache(cachePath, url)