Compare commits
No commits in common. "ad3ba9de1a3e33b91dae4fd59cb805118428f41c" and "038f267ea2b0fc1c1a97ff72cc7e9a8fd857b16f" have entirely different histories.
ad3ba9de1a
...
038f267ea2
|
@ -130,7 +130,7 @@ def parseOptions(options):
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def ItemFix(item, options, feedurl='/'):
|
def ItemFix(item, feedurl='/'):
|
||||||
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
""" Improves feed items (absolute links, resolve feedburner links, etc) """
|
||||||
|
|
||||||
# check unwanted uppercase title
|
# check unwanted uppercase title
|
||||||
|
@ -149,13 +149,6 @@ def ItemFix(item, options, feedurl='/'):
|
||||||
item.link = match[0]
|
item.link = match[0]
|
||||||
log(item.link)
|
log(item.link)
|
||||||
|
|
||||||
# at user's election, use first <a>
|
|
||||||
if options.firstlink and (item.desc or item.content):
|
|
||||||
match = lxml.html.fromstring(item.desc or item.content).xpath('//a/@href')
|
|
||||||
if len(match):
|
|
||||||
item.link = match[0]
|
|
||||||
log(item.link)
|
|
||||||
|
|
||||||
# check relative urls
|
# check relative urls
|
||||||
item.link = urljoin(feedurl, item.link)
|
item.link = urljoin(feedurl, item.link)
|
||||||
|
|
||||||
|
@ -217,10 +210,36 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||||
|
|
||||||
if not item.link:
|
if not item.link:
|
||||||
log('no link')
|
log('no link')
|
||||||
return True
|
return item
|
||||||
|
|
||||||
log(item.link)
|
log(item.link)
|
||||||
|
|
||||||
|
link = item.link
|
||||||
|
|
||||||
|
# twitter
|
||||||
|
if urlparse(feedurl).netloc == 'twitter.com':
|
||||||
|
match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
|
||||||
|
if len(match):
|
||||||
|
link = match[0]
|
||||||
|
log(link)
|
||||||
|
|
||||||
|
else:
|
||||||
|
link = None
|
||||||
|
|
||||||
|
# facebook
|
||||||
|
if urlparse(feedurl).netloc == 'graph.facebook.com':
|
||||||
|
match = lxml.html.fromstring(item.content).xpath('//a/@href')
|
||||||
|
if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
|
||||||
|
link = match[0]
|
||||||
|
log(link)
|
||||||
|
|
||||||
|
else:
|
||||||
|
link = None
|
||||||
|
|
||||||
|
if link is None:
|
||||||
|
log('no used link')
|
||||||
|
return True
|
||||||
|
|
||||||
# download
|
# download
|
||||||
delay = -1
|
delay = -1
|
||||||
|
|
||||||
|
@ -236,7 +255,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||||
delay = 24*60*60 # 24h
|
delay = 24*60*60 # 24h
|
||||||
|
|
||||||
try:
|
try:
|
||||||
req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
|
req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
|
||||||
|
|
||||||
except (IOError, HTTPException) as e:
|
except (IOError, HTTPException) as e:
|
||||||
log('http error')
|
log('http error')
|
||||||
|
@ -356,7 +375,7 @@ def FeedGather(rss, url, options):
|
||||||
if item is None:
|
if item is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
item = ItemFix(item, options, url)
|
item = ItemFix(item, url)
|
||||||
|
|
||||||
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
|
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
|
||||||
if not options.proxy:
|
if not options.proxy:
|
||||||
|
|
|
@ -185,11 +185,6 @@
|
||||||
<option value=":html">HTML</option>
|
<option value=":html">HTML</option>
|
||||||
<option value=":csv">CSV</option>
|
<option value=":csv">CSV</option>
|
||||||
</select>
|
</select>
|
||||||
using
|
|
||||||
<select>
|
|
||||||
<option value="">the standard link</option>
|
|
||||||
<option value=":firstlink" title="Useful for Twitter feeds for example, to get the articles referred to in tweets rather than the tweet itself">the first link from the description (?)</option>
|
|
||||||
</select>
|
|
||||||
and
|
and
|
||||||
<select>
|
<select>
|
||||||
<option value="">keep</option>
|
<option value="">keep</option>
|
||||||
|
|
Loading…
Reference in New Issue