From 85e40cde4e6ebb6c1f7a80997da93c720a68e36e Mon Sep 17 00:00:00 2001 From: pictuga Date: Wed, 15 May 2013 17:24:27 +0200 Subject: [PATCH] Check article length is big enough Avoids replacing rather useful descriptions with empty string --- morss.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/morss.py b/morss.py index 0d5198c..d174644 100644 --- a/morss.py +++ b/morss.py @@ -67,6 +67,9 @@ def cleanXML(xml): table = string.maketrans('', '') return xml.translate(table, table[:32]).lstrip() +def lenHTML(txt): + return len(lxml.html.fromstring(txt).text_content()) + def parseOptions(available): options = None if 'REQUEST_URI' in os.environ: @@ -347,12 +350,13 @@ def Fill(rss, cache, mode='feed'): # check cache and previous errors if item.link in cache: content = cache.get(item.link) - if content == 'httperr': + match = re.search(r'^error-([a-z]{2,10})$', content) + if match: if cache.isYoungerThan(DELAY*60): - log('cached http err') + log('cached error: %s' % match.groups()[0]) return else: - log('old http error') + log('old error') else: log('cached') item.content = cache.get(item.link) @@ -367,16 +371,21 @@ def Fill(rss, cache, mode='feed'): if ddl is False: log('http error') - cache.set(item.link, 'httperr') + cache.set(item.link, 'error-http') return data, enc, url = ddl log(enc) out = readability.Document(data.decode(enc, 'ignore'), url=url).summary(True) + if 'desc' not in item or lenHTML(out) > lenHTML(item.desc): + item.content = out + cache.set(item.link, out) + else: + log('not bigger enough') + cache.set(item.link, 'error-length') + return - item.content = out - cache.set(item.link, out) def Gather(url, cachePath, mode='feed'): cache = Cache(cachePath, url)