Compare commits
3 Commits
038f267ea2 ... ad3ba9de1a
Author | SHA1 | Date
---|---|---
pictuga | ad3ba9de1a |
pictuga | 68c46a1823 |
pictuga | 91be2d229e |
@@ -130,7 +130,7 @@ def parseOptions(options):
     return out
 
 
-def ItemFix(item, feedurl='/'):
+def ItemFix(item, options, feedurl='/'):
     """ Improves feed items (absolute links, resolve feedburner links, etc) """
 
     # check unwanted uppercase title
@@ -149,6 +149,13 @@ def ItemFix(item, feedurl='/'):
             item.link = match[0]
             log(item.link)
 
+    # at user's election, use first <a>
+    if options.firstlink and (item.desc or item.content):
+        match = lxml.html.fromstring(item.desc or item.content).xpath('//a/@href')
+        if len(match):
+            item.link = match[0]
+            log(item.link)
+
     # check relative urls
     item.link = urljoin(feedurl, item.link)
 
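For context, here is the extraction the new firstlink branch performs, rehearsed in isolation. The HTML snippet below is made up; only the xpath call mirrors the hunk:

import lxml.html

# Made-up item description, as a tweet-style feed entry might carry it.
desc = '<p>Tweet text, see <a href="https://example.com/article">this article</a></p>'

# Same lookup as the new branch: every <a href> in the description;
# the first hit replaces item.link.
match = lxml.html.fromstring(desc).xpath('//a/@href')
if len(match):
    print(match[0])  # https://example.com/article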
@@ -210,36 +217,10 @@ def ItemFill(item, options, feedurl='/', fast=False):
 
     if not item.link:
         log('no link')
-        return item
+        return True
 
     log(item.link)
 
-    link = item.link
-
-    # twitter
-    if urlparse(feedurl).netloc == 'twitter.com':
-        match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url')
-        if len(match):
-            link = match[0]
-            log(link)
-
-        else:
-            link = None
-
-    # facebook
-    if urlparse(feedurl).netloc == 'graph.facebook.com':
-        match = lxml.html.fromstring(item.content).xpath('//a/@href')
-        if len(match) and urlparse(match[0]).netloc != 'www.facebook.com':
-            link = match[0]
-            log(link)
-
-        else:
-            link = None
-
-    if link is None:
-        log('no used link')
-        return True
-
     # download
     delay = -1
 
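The Twitter- and Facebook-specific link resolution removed here looks to be superseded by the generic firstlink branch added to ItemFix above. A small sketch of how the two extraction strategies differ, using made-up tweet markup:

import lxml.html

# A tweet-style anchor carrying both the shortened href and Twitter's
# expanded-URL attribute.
desc = '<a href="https://t.co/abc" data-expanded-url="https://example.com/article">example.com/article</a>'
tree = lxml.html.fromstring(desc)

# Old, Twitter-only resolution (removed by this hunk):
print(tree.xpath('//a/@data-expanded-url'))  # ['https://example.com/article']

# New, site-agnostic extraction (the firstlink branch above):
print(tree.xpath('//a/@href'))               # ['https://t.co/abc']

The generic option reads the literal href (here the t.co shortener), trading per-site unshortening for a rule that works on any feed.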
@@ -255,7 +236,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = 24*60*60 # 24h
 
     try:
-        req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
+        req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
 
     except (IOError, HTTPException) as e:
         log('http error')
@@ -375,7 +356,7 @@ def FeedGather(rss, url, options):
         if item is None:
             continue
 
-        item = ItemFix(item, url)
+        item = ItemFix(item, options, url)
 
         if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
             if not options.proxy:
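Because ItemFix now takes the parsed options between item and feedurl, every call site has to be updated in the same commit, which is what this hunk does. A minimal stand-in showing the new call shape; SimpleNamespace is an assumption here, as the diff only shows that the real object exposes attributes such as firstlink and proxy:

from types import SimpleNamespace

# Stub mirroring only the new signature, not morss's real ItemFix body.
def ItemFix(item, options, feedurl='/'):
    if options.firstlink:
        print('would rewrite', item.link, 'from its first <a>')
    return item

options = SimpleNamespace(firstlink=True, proxy=False)
item = SimpleNamespace(link='https://twitter.com/user/status/1')

item = ItemFix(item, options, 'https://twitter.com/user')  # new call shape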

@@ -185,6 +185,11 @@
     <option value=":html">HTML</option>
     <option value=":csv">CSV</option>
 </select>
+using
+<select>
+    <option value="">the standard link</option>
+    <option value=":firstlink" title="Useful for Twitter feeds for example, to get the articles referred to in tweets rather than the tweet itself">the first link from the description (?)</option>
+</select>
 and
 <select>
     <option value="">keep</option>
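On the client side, selecting that new option would prepend :firstlink to the request path, assuming morss's usual host/:options/feed-url scheme (the host below is a placeholder):

https://your-morss-host/:firstlink/https://twitter.com/user

The title attribute on the new option states the intent: fetch the article a tweet points to rather than the tweet itself.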