Compare commits

...

4 Commits

5 changed files with 45 additions and 50 deletions

README.md · View File

@@ -58,6 +58,10 @@ Simplest way to get these:
 pip install git+https://git.pictuga.com/pictuga/morss.git@master
 ```
 
+The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
+C code needs to be compiled). If possible on your distribution, try installing
+it with the system package manager.
+
 You may also need:
 - Apache, with python-cgi support, to run on a server
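(For reference on the `lxml` note added above: on Debian-based distributions such as Raspberry Pi OS, a pre-compiled package is usually available from the system package manager, which avoids the slow local C build. The package name below is the usual Debian one and may differ on other distributions.)

```
sudo apt install python3-lxml
```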

morss/crawler.py · View File

@@ -51,12 +51,15 @@ DEFAULT_UAS = [
 ]
 
+PROTOCOL = ['http', 'https']
+
+
 def get(*args, **kwargs):
-    return adv_get(*args, **kwargs)[0]
+    return adv_get(*args, **kwargs)['data']
 
 
 def adv_get(url, timeout=None, *args, **kwargs):
-    url = encode_url(url)
+    url = sanitize_url(url)
 
     if timeout is None:
         con = custom_handler(*args, **kwargs).open(url)
@@ -69,7 +72,13 @@ def adv_get(url, timeout=None, *args, **kwargs):
     contenttype = con.info().get('Content-Type', '').split(';')[0]
     encoding = detect_encoding(data, con)
 
-    return data, con, contenttype, encoding
+    return {
+        'data': data,
+        'url': con.geturl(),
+        'con': con,
+        'contenttype': contenttype,
+        'encoding': encoding
+    }
 
 
 def custom_handler(follow=None, delay=None, encoding=None):
@@ -113,8 +122,16 @@ def is_ascii(string):
     return True
 
 
-def encode_url(url):
-    " Escape non-ascii unicode characters "
+def sanitize_url(url):
+    if isinstance(url, bytes):
+        url = url.decode()
+
+    if url.split(':', 1)[0] not in PROTOCOL:
+        url = 'http://' + url
+
+    url = url.replace(' ', '%20')
+
+    # Escape non-ascii unicode characters
     # https://stackoverflow.com/a/4391299
     parts = list(urlparse(url))
@@ -613,4 +630,4 @@ if __name__ == '__main__':
-    data, con, contenttype, encoding = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
 
     if not sys.flags.interactive:
-        print(data.decode(encoding))
+        print(req['data'].decode(req['encoding']))
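Taken together, the crawler.py hunks above change `adv_get()` from returning a `(data, con, contenttype, encoding)` tuple to returning a dict. A minimal usage sketch of the new interface (the URL is only a placeholder; the keys are the ones defined in the diff):

```python
from morss import crawler

req = crawler.adv_get('https://morss.it')  # now returns a dict instead of a tuple

print(req['url'])                           # final URL, i.e. con.geturl()
print(req['contenttype'])                   # e.g. 'text/html'
print(req['data'].decode(req['encoding']))  # decoded response body
# req['con'] still exposes the underlying connection object
```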

morss/feeds.py · View File

@@ -71,7 +71,7 @@ def parse_rules(filename=None):
     return rules
 
 
-def parse(data, url=None, mimetype=None, encoding=None):
+def parse(data, url=None, encoding=None):
     " Determine which ruleset to use "
 
     rulesets = parse_rules()
@@ -759,8 +759,8 @@ class ItemJSON(Item, ParserJSON):
 if __name__ == '__main__':
     from . import crawler
 
-    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
-    feed = parse(data, url=con.geturl(), mimetype=contenttype, encoding=encoding)
+    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
+    feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
 
     if not sys.flags.interactive:
         for item in feed.items:

morss/morss.py · View File

@@ -44,8 +44,6 @@ TIMEOUT = 4 # http timeout (in sec)
 DEBUG = False
 PORT = 8080
 
-PROTOCOL = ['http', 'https']
-
 
 def filterOptions(options):
     return options
@@ -250,17 +248,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
+        req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
 
     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode
 
-    if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
+    if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain':
         log('non-text page')
         return True
 
-    out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
 
     if out is not None:
         item.content = out
@@ -297,22 +295,6 @@ def ItemAfter(item, options):
     return item
 
 
-def UrlFix(url):
-    if url is None:
-        raise MorssException('No url provided')
-
-    if isinstance(url, bytes):
-        url = url.decode()
-
-    if urlparse(url).scheme not in PROTOCOL:
-        url = 'http://' + url
-        log(url)
-
-    url = url.replace(' ', '%20')
-
-    return url
-
-
 def FeedFetch(url, options):
     # fetch feed
     delay = DELAY
@@ -321,14 +303,14 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
+        req = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
 
    except (IOError, HTTPException):
        raise MorssException('Error downloading feed')
 
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml, encoding=encoding)
+        rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
 
         rss.rules['title'] = options.title if options.title else '//head/title'
         rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@@ -348,13 +330,13 @@ def FeedFetch(url, options):
     else:
         try:
-            rss = feeds.parse(xml, url, contenttype, encoding=encoding)
+            rss = feeds.parse(req['data'], url=url, encoding=req['encoding'])
             rss = rss.convert(feeds.FeedXML)
                 # contains all fields, otherwise much-needed data can be lost
 
         except TypeError:
             log('random page')
-            log(contenttype)
+            log(req['contenttype'])
             raise MorssException('Link provided is not a valid feed')
 
     return rss
@@ -456,7 +438,6 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
@@ -529,7 +510,6 @@ def cgi_app(environ, start_response):
     crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
 
     # get the work done
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
 
     if headers['content-type'] == 'text/xml':
@@ -614,17 +594,12 @@ def cgi_get(environ, start_response):
     url, options = cgi_parse_environ(environ)
 
     # get page
-    PROTOCOL = ['http', 'https']
-
-    if urlparse(url).scheme not in ['http', 'https']:
-        url = 'http://' + url
-
-    data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
+    req = crawler.adv_get(url=url, timeout=TIMEOUT)
 
-    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
         if options.get == 'page':
-            html = readabilite.parse(data, encoding=encoding)
-            html.make_links_absolute(con.geturl())
+            html = readabilite.parse(req['data'], encoding=req['encoding'])
+            html.make_links_absolute(req['url'])
 
             kill_tags = ['script', 'iframe', 'noscript']
@@ -635,13 +610,13 @@ def cgi_get(environ, start_response):
             output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
 
         elif options.get == 'article':
-            output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug)
+            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
 
         else:
             raise MorssException('no :get option passed')
 
     else:
-        output = data
+        output = req['data']
 
     # return html page
     headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
@@ -698,7 +673,6 @@ def cli_app():
     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options, 'unicode')

morss/readabilite.py · View File

@@ -348,8 +348,8 @@ if __name__ == '__main__':
     import sys
     from . import crawler
 
-    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
-    article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
 
     if not sys.flags.interactive:
         print(article)
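Each touched module keeps an `if __name__ == '__main__':` block like the one above, so the refactor can be smoke-tested from the command line. Assuming the morss package is importable (for example, installed with the pip command from the README), something along these lines should exercise the new dict-based interface (the URLs are just the defaults used in the diff):

```
python -m morss.crawler https://morss.it
python -m morss.feeds https://www.nytimes.com/
python -m morss.readabilite https://morss.it
```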