Check url before looking for provided content

Also use the recently defined lenHTML() function
master
pictuga 2013-05-15 17:32:42 +02:00
parent 85e40cde4e
commit 8e2aab55e7
1 changed files with 10 additions and 8 deletions

View File

@ -324,14 +324,7 @@ def Fill(rss, cache, mode='feed'):
log('no link') log('no link')
return return
# content already provided? # feedburner and others
if 'content' in item and 'desc' in item:
content_len = len(lxml.html.fromstring(item.content).text_content())
log('content: %s vs %s' % (content_len, len(item.desc)))
if content_len > 5*len(item.desc):
log('provided')
return
if '{http://rssnamespace.org/feedburner/ext/1.0}origLink' in item: if '{http://rssnamespace.org/feedburner/ext/1.0}origLink' in item:
item.link = item['{http://rssnamespace.org/feedburner/ext/1.0}origLink'] item.link = item['{http://rssnamespace.org/feedburner/ext/1.0}origLink']
log(item.link) log(item.link)
@ -347,6 +340,15 @@ def Fill(rss, cache, mode='feed'):
if urlparse.urlparse(item.link).netloc is '': if urlparse.urlparse(item.link).netloc is '':
item.link = urlparse.urljoin(feedurl, item.link) item.link = urlparse.urljoin(feedurl, item.link)
# content already provided?
if 'content' in item and 'desc' in item:
len_content = lenHTML(item.content)
len_desc = lenHTML(item.desc)
log('content: %s vs %s' % (len_content, len_desc))
if len_content > 5*len_desc:
log('provided')
return
# check cache and previous errors # check cache and previous errors
if item.link in cache: if item.link in cache:
content = cache.get(item.link) content = cache.get(item.link)