split morss.py into __main__/cgi/cli.py

Should hopefully allow cleaner code in the future
branch: master
pictuga, 2020-08-21 22:17:55 +02:00
parent c6d3a0eb53, commit c6b52e625f
6 changed files with 341 additions and 307 deletions

main.py

@@ -1,6 +1,7 @@
 #!/usr/bin/env python

-from morss import main, cgi_standalone_app as application
+from morss.__main__ import main
+from morss.cgi import application

 if __name__ == '__main__':
     main()
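main.py keeps exporting a module-level `application`, so the wrapper still works under any WSGI host. A minimal smoke-test sketch using only the standard library, assuming main.py is on the import path (the port is arbitrary):

import wsgiref.simple_server

from main import application  # the callable re-exported above

# serve the fully wrapped morss app locally
httpd = wsgiref.simple_server.make_server('', 8000, application)
httpd.serve_forever()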

morss/__init__.py

@@ -1,2 +1,3 @@
 # ran on `import morss`
 from .morss import *
+from .cgi import application
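The re-export keeps the WSGI callable reachable from the package root, so deployments importing it from `morss` directly keep working. A quick check, assuming the package is installed:

import morss
import morss.cgi

# both names point at the same fully wrapped app
assert morss.application is morss.cgi.application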

morss/__main__.py

@@ -1,5 +1,74 @@
 # ran on `python -m morss`

-from .morss import main
+import os
+import sys
+
+from . import cgi
+from . import cli
+
+from .morss import MorssException
+
+import wsgiref.simple_server
+import wsgiref.handlers
+
+
+PORT = 8080
+
+
+def isInt(string):
+    try:
+        int(string)
+        return True
+
+    except ValueError:
+        return False
+
+
+def main():
+    if 'REQUEST_URI' in os.environ:
+        # mod_cgi
+
+        app = cgi.cgi_app
+        app = cgi.cgi_dispatcher(app)
+        app = cgi.cgi_error_handler(app)
+        app = cgi.cgi_encode(app)
+
+        wsgiref.handlers.CGIHandler().run(app)
+
+    elif len(sys.argv) <= 1 or isInt(sys.argv[1]):
+        # start internal (basic) http server
+        if len(sys.argv) > 1 and isInt(sys.argv[1]):
+            argPort = int(sys.argv[1])
+
+            if argPort > 0:
+                port = argPort
+
+            else:
+                raise MorssException('Port must be positive integer')
+
+        else:
+            port = PORT
+
+        app = cgi.cgi_app
+        app = cgi.cgi_file_handler(app)
+        app = cgi.cgi_dispatcher(app)
+        app = cgi.cgi_error_handler(app)
+        app = cgi.cgi_encode(app)
+
+        print('Serving http://localhost:%s/' % port)
+
+        httpd = wsgiref.simple_server.make_server('', port, app)
+        httpd.serve_forever()
+
+    else:
+        # as a CLI app
+        try:
+            cli.cli_app()
+
+        except (KeyboardInterrupt, SystemExit):
+            raise
+
+        except Exception as e:
+            print('ERROR: %s' % e)  # was `e.message`, which breaks on python 3
+
+
 if __name__ == '__main__':
     main()
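main() picks a mode from the environment and argv: a set REQUEST_URI means mod_cgi, a missing or integer first argument starts the built-in server, and anything else falls through to the CLI. The pivot is isInt; a quick demonstration of how arguments route (the url is a placeholder):

from morss.__main__ import isInt

assert isInt('8080')                     # `python -m morss 8080` -> http server
assert not isInt('http://example.com/')  # `python -m morss <url>` -> CLI mode
assert not isInt('')                     # ValueError is caught -> False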

morss/cgi.py (new file, mode 100644, 242 lines)
import sys
import os.path
import re

import lxml.etree  # needed by cgi_get below; missing from the commit as published

import cgitb

try:
    # python 2
    from urllib import unquote

except ImportError:
    # python 3
    from urllib.parse import unquote

from . import crawler
from . import readabilite

from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options, filterOptions, parseOptions
# TIMEOUT added to this import: cgi_get uses it but the commit did not pull it in
from .morss import log, TIMEOUT, DELAY, DEBUG, MorssException


def cgi_parse_environ(environ):
    # get options

    if 'REQUEST_URI' in environ:
        url = environ['REQUEST_URI'][1:]

    else:
        url = environ['PATH_INFO'][1:]

        if environ['QUERY_STRING']:
            url += '?' + environ['QUERY_STRING']

    url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)

    if url.startswith(':'):
        split = url.split('/', 1)

        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]

        if len(split) > 1:
            url = split[1]

        else:
            url = ''

    else:
        raw_options = []

    # init
    options = Options(filterOptions(parseOptions(raw_options)))

    global DEBUG
    DEBUG = options.debug

    return (url, options)


def cgi_app(environ, start_response):
    url, options = cgi_parse_environ(environ)

    headers = {}

    # headers
    headers['status'] = '200 OK'
    headers['cache-control'] = 'max-age=%s' % DELAY
    headers['x-content-type-options'] = 'nosniff' # safari work around

    if options.cors:
        headers['access-control-allow-origin'] = '*'

    if options.html:
        headers['content-type'] = 'text/html'
    elif options.txt or options.silent:
        headers['content-type'] = 'text/plain'
    elif options.json:
        headers['content-type'] = 'application/json'
    elif options.callback:
        headers['content-type'] = 'application/javascript'
    elif options.csv:
        headers['content-type'] = 'text/csv'
        headers['content-disposition'] = 'attachment; filename="feed.csv"'
    else:
        headers['content-type'] = 'text/xml'

    headers['content-type'] += '; charset=utf-8'

    crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))

    # get the work done
    url, rss = FeedFetch(url, options)

    start_response(headers['status'], list(headers.items()))

    rss = FeedGather(rss, url, options)
    out = FeedFormat(rss, options)

    if options.silent:
        return ['']

    else:
        return [out]


def middleware(func):
    " Decorator to turn a function into a wsgi middleware "
    # This is called when parsing the "@middleware" code

    def app_builder(app):
        # This is called when doing app = cgi_wrapper(app)

        def app_wrap(environ, start_response):
            # This is called when a http request is being processed

            return func(environ, start_response, app)

        return app_wrap

    return app_builder


@middleware
def cgi_file_handler(environ, start_response, app):
    " Simple HTTP server to serve static files (.html, .css, etc.) "

    files = {
        '': 'text/html',
        'index.html': 'text/html',
        'sheet.xsl': 'text/xsl'}

    if 'REQUEST_URI' in environ:
        url = environ['REQUEST_URI'][1:]

    else:
        url = environ['PATH_INFO'][1:]

    if url in files:
        headers = {}

        if url == '':
            url = 'index.html'

        paths = [os.path.join(sys.prefix, 'share/morss/www', url),
                 os.path.join(os.path.dirname(__file__), '../www', url)]

        for path in paths:
            try:
                body = open(path, 'rb').read()

                headers['status'] = '200 OK'
                headers['content-type'] = files[url]
                start_response(headers['status'], list(headers.items()))
                return [body]

            except IOError:
                continue

        else:
            # the for loop did not return, so here we are, i.e. no file found
            headers['status'] = '404 Not found'
            start_response(headers['status'], list(headers.items()))
            return ['Error %s' % headers['status']]

    else:
        return app(environ, start_response)


def cgi_get(environ, start_response):
    url, options = cgi_parse_environ(environ)

    # get page
    req = crawler.adv_get(url=url, timeout=TIMEOUT)

    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
        if options.get == 'page':
            html = readabilite.parse(req['data'], encoding=req['encoding'])
            html.make_links_absolute(req['url'])

            kill_tags = ['script', 'iframe', 'noscript']

            for tag in kill_tags:
                for elem in html.xpath('//'+tag):
                    elem.getparent().remove(elem)

            output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')

        elif options.get == 'article':
            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)

        else:
            raise MorssException('no :get option passed')

    else:
        output = req['data']

    # return html page
    headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
    start_response(headers['status'], list(headers.items()))

    return [output]


dispatch_table = {
    'get': cgi_get,
    }


@middleware
def cgi_dispatcher(environ, start_response, app):
    url, options = cgi_parse_environ(environ)

    for key in dispatch_table.keys():
        if key in options:
            return dispatch_table[key](environ, start_response)

    return app(environ, start_response)


@middleware
def cgi_error_handler(environ, start_response, app):
    try:
        return app(environ, start_response)

    except (KeyboardInterrupt, SystemExit):
        raise

    except Exception as e:
        headers = {'status': '500 Oops', 'content-type': 'text/html'}
        start_response(headers['status'], list(headers.items()), sys.exc_info())
        log('ERROR: %s' % repr(e), force=True)
        return [cgitb.html(sys.exc_info())]


@middleware
def cgi_encode(environ, start_response, app):
    out = app(environ, start_response)
    return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]


application = cgi_app
application = cgi_file_handler(application)
application = cgi_dispatcher(application)
application = cgi_error_handler(application)
application = cgi_encode(application)
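middleware is a plain closure factory: the decorated function receives the downstream app as its third argument, so the last wrapper applied runs first on each request. A self-contained toy with the same shape as the application stack above (add_marker and hello are invented for illustration):

from morss.cgi import middleware, cgi_encode

@middleware
def add_marker(environ, start_response, app):
    # toy wrapper: runs before the downstream app on every request
    environ['x.marker'] = 'seen'
    return app(environ, start_response)

def hello(environ, start_response):
    start_response('200 OK', [('content-type', 'text/plain')])
    return ['hello ' + environ['x.marker']]  # str, turned into bytes by cgi_encode

app = cgi_encode(add_marker(hello))  # outermost wrapper runs first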

morss/cli.py (new file, mode 100644, 26 lines)

import sys
import os.path

from . import crawler

from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options, filterOptions, parseOptions
from .morss import log, DEBUG


def cli_app():
    options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
    url = sys.argv[-1]

    global DEBUG
    DEBUG = options.debug

    crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))

    url, rss = FeedFetch(url, options)
    rss = FeedGather(rss, url, options)
    out = FeedFormat(rss, options, 'unicode')

    if not options.silent:
        print(out)

    log('done')
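cli_app reads option tokens from every argument but the last and treats the final argument as the feed url, so the url must always come last. A sketch of driving it programmatically; the option token and url are placeholders, see parseOptions in morss/morss.py for the accepted forms:

import sys

from morss.cli import cli_app

# equivalent to: python -m morss <options...> <url>
sys.argv = ['morss', 'txt', 'http://example.com/feed']  # placeholder arguments
cli_app()  # prints the processed feed unless the silent option is set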

morss/morss.py

@@ -1,6 +1,4 @@
-import sys
 import os
-import os.path
 import time

 from datetime import datetime
@@ -16,20 +14,14 @@ from . import feeds
 from . import crawler
 from . import readabilite

-import wsgiref.simple_server
-import wsgiref.handlers
-
-import cgitb
-
 try:
     # python 2
     from httplib import HTTPException
-    from urllib import unquote
     from urlparse import urlparse, urljoin, parse_qs

 except ImportError:
     # python 3
     from http.client import HTTPException
-    from urllib.parse import unquote
     from urllib.parse import urlparse, urljoin, parse_qs


 MAX_ITEM = 5 # cache-only beyond
@@ -42,7 +34,6 @@ DELAY = 10 * 60 # xml cache & ETag cache (in sec)
 TIMEOUT = 4 # http timeout (in sec)

 DEBUG = False
-PORT = 8080


 def filterOptions(options):
@@ -437,299 +428,3 @@ def process(url, cache=None, options=None):
     rss = FeedGather(rss, url, options)

     return FeedFormat(rss, options, 'unicode')
[296 deleted lines omitted: cgi_parse_environ, cgi_app, middleware, cgi_file_handler, cgi_get, dispatch_table, cgi_dispatcher, cgi_error_handler, cgi_encode and the cgi_standalone_app stack moved to morss/cgi.py (cgi_standalone_app renamed to application); cli_app moved to morss/cli.py; isInt, main and the `if __name__ == '__main__'` guard moved to morss/__main__.py. The deleted code is identical to the new files shown above.]