From 8e2aab55e783f7f03a0c3c3cb166011ed5f7bb90 Mon Sep 17 00:00:00 2001 From: pictuga Date: Wed, 15 May 2013 17:32:42 +0200 Subject: [PATCH] Check url before looking for provided content Also use lenHTML() function defined lately --- morss.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/morss.py b/morss.py index d174644..b3ac9e7 100644 --- a/morss.py +++ b/morss.py @@ -324,14 +324,7 @@ def Fill(rss, cache, mode='feed'): log('no link') return - # content already provided? - if 'content' in item and 'desc' in item: - content_len = len(lxml.html.fromstring(item.content).text_content()) - log('content: %s vs %s' % (content_len, len(item.desc))) - if content_len > 5*len(item.desc): - log('provided') - return - + # feedburner and others if '{http://rssnamespace.org/feedburner/ext/1.0}origLink' in item: item.link = item['{http://rssnamespace.org/feedburner/ext/1.0}origLink'] log(item.link) @@ -347,6 +340,15 @@ def Fill(rss, cache, mode='feed'): if urlparse.urlparse(item.link).netloc is '': item.link = urlparse.urljoin(feedurl, item.link) + # content already provided? + if 'content' in item and 'desc' in item: + len_content = lenHTML(item.content) + len_desc = lenHTML(item.desc) + log('content: %s vs %s' % (len_content, len_desc)) + if len_content > 5*len_desc: + log('provided') + return + # check cache and previous errors if item.link in cache: content = cache.get(item.link)