Remove "clip" from Fill

Moved it into Gather, and removed it from feeds.py as well. An "alone" mode was also added (it removes the description).
master
pictuga 2013-10-01 19:45:54 +02:00
parent 1b7fe8fbee
commit 78706952fe
2 changed files with 15 additions and 10 deletions

View File

@@ -343,14 +343,11 @@ class FeedItem(FeedBase):
description = desc = FeedDescriptor('desc') description = desc = FeedDescriptor('desc')
content = FeedDescriptor('content') content = FeedDescriptor('content')
def pushContent(self, value, clip=False): def pushContent(self, value):
if not self.desc and self.content: if not self.desc and self.content:
self.desc = self.content self.desc = self.content
if self.desc and clip: self.content = value
self.content = self.desc + "<br/><br/>* * *<br/><br/>" + value
else:
self.content = value
def remove(self): def remove(self):
self.xml.getparent().remove(self.xml) self.xml.getparent().remove(self.xml)

View File

@@ -252,7 +252,7 @@ def decodeHTML(data, con=None):
log(enc) log(enc)
return data.decode(enc, 'replace') return data.decode(enc, 'replace')
def Fill(item, cache, feedurl='/', fast=False, clip=False): def Fill(item, cache, feedurl='/', fast=False):
""" Returns True when it has done its best """ """ Returns True when it has done its best """
if not item.link: if not item.link:
@@ -309,7 +309,6 @@ def Fill(item, cache, feedurl='/', fast=False, clip=False):
match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url') match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
if len(match): if len(match):
link = match[0] link = match[0]
clip = True
log(link) log(link)
else: else:
link = None link = None
@@ -330,7 +329,7 @@ def Fill(item, cache, feedurl='/', fast=False, clip=False):
log('old error') log('old error')
else: else:
log('cached') log('cached')
item.pushContent(cache.get(link), clip) item.pushContent(cache.get(link))
return True return True
# super-fast mode # super-fast mode
@@ -356,7 +355,7 @@ def Fill(item, cache, feedurl='/', fast=False, clip=False):
out = readability.Document(data, url=con.url).summary(True) out = readability.Document(data, url=con.url).summary(True)
if countWord(out) > max(count_content, count_desc) > 0: if countWord(out) > max(count_content, count_desc) > 0:
item.pushContent(out, clip) item.pushContent(out)
cache.set(link, out) cache.set(link, out)
else: else:
log('not bigger enough') log('not bigger enough')
@@ -435,11 +434,20 @@ def Gather(url, cachePath, options):
if i+1 > LIM_ITEM > 0: if i+1 > LIM_ITEM > 0:
item.remove() item.remove()
continue
elif time.time() - startTime > MAX_TIME >= 0 or i+1 > MAX_ITEM > 0: elif time.time() - startTime > MAX_TIME >= 0 or i+1 > MAX_ITEM > 0:
if Fill(item, cache, url, True) is False: if Fill(item, cache, url, True) is False:
item.remove() item.remove()
continue
else: else:
Fill(item, cache, url, clip='clip' in options) Fill(item, cache, url)
if item.desc and item.content:
if 'clip' in options:
item.content = item.desc + "<br/><br/>* * *<br/><br/>" + item.content
del item.desc
if 'alone' in options:
del item.desc
log(len(rss.items)) log(len(rss.items))
log(time.time() - startTime) log(time.time() - startTime)