Compare commits

...

3 Commits

2 changed files with 16 additions and 30 deletions

View File

@ -130,7 +130,7 @@ def parseOptions(options):
return out
def ItemFix(item, feedurl='/'):
def ItemFix(item, options, feedurl='/'):
""" Improves feed items (absolute links, resolve feedburner links, etc) """
# check unwanted uppercase title
@ -149,6 +149,13 @@ def ItemFix(item, feedurl='/'):
item.link = match[0]
log(item.link)
# if the user enabled the firstlink option, use the first <a> href found in the description/content as the item link
if options.firstlink and (item.desc or item.content):
match = lxml.html.fromstring(item.desc or item.content).xpath('//a/@href')
if len(match):
item.link = match[0]
log(item.link)
# check relative urls
item.link = urljoin(feedurl, item.link)
@ -210,36 +217,10 @@ def ItemFill(item, options, feedurl='/', fast=False):
if not item.link:
log('no link')
return item
return True
log(item.link)
link = item.link
# twitter
if urlparse(feedurl).netloc == 'twitter.com':
match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
if len(match):
link = match[0]
log(link)
else:
link = None
# facebook
if urlparse(feedurl).netloc == 'graph.facebook.com':
match = lxml.html.fromstring(item.content).xpath('//a/@href')
if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
link = match[0]
log(link)
else:
link = None
if link is None:
log('no used link')
return True
# download
delay = -1
@ -255,7 +236,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
delay = 24*60*60 # 24h
try:
req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
except (IOError, HTTPException) as e:
log('http error')
@ -375,7 +356,7 @@ def FeedGather(rss, url, options):
if item is None:
continue
item = ItemFix(item, url)
item = ItemFix(item, options, url)
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy:

View File

@ -185,6 +185,11 @@
<option value=":html">HTML</option>
<option value=":csv">CSV</option>
</select>
using
<select>
<option value="">the standard link</option>
<option value=":firstlink" title="Useful for Twitter feeds for example, to get the articles referred to in tweets rather than the tweet itself">the first link from the description (?)</option>
</select>
and
<select>
<option value="">keep</option>