From c6b52e625f39335090961e8e43bf65408c40e523 Mon Sep 17 00:00:00 2001 From: pictuga Date: Fri, 21 Aug 2020 22:17:55 +0200 Subject: [PATCH] split morss.py into __main__/cgi/cli.py Should hopefully allow cleaner code in the future --- main.py | 3 +- morss/__init__.py | 1 + morss/__main__.py | 71 ++++++++++- morss/cgi.py | 242 ++++++++++++++++++++++++++++++++++++ morss/cli.py | 26 ++++ morss/morss.py | 305 ---------------------------------------------- 6 files changed, 341 insertions(+), 307 deletions(-) create mode 100644 morss/cgi.py create mode 100644 morss/cli.py diff --git a/main.py b/main.py index e891255..3fa0712 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,7 @@ #!/usr/bin/env python -from morss import main, cgi_standalone_app as application +from morss.__main__ import main +from morss.cgi import application if __name__ == '__main__': main() diff --git a/morss/__init__.py b/morss/__init__.py index 7bd4c79..b1f7595 100644 --- a/morss/__init__.py +++ b/morss/__init__.py @@ -1,2 +1,3 @@ # ran on `import morss` from .morss import * +from .cgi import application diff --git a/morss/__main__.py b/morss/__main__.py index a71ee90..8287da1 100644 --- a/morss/__main__.py +++ b/morss/__main__.py @@ -1,5 +1,74 @@ # ran on `python -m morss` -from .morss import main + +import os +import sys + +from . import cgi +from . import cli + +from .morss import MorssException + +import wsgiref.simple_server +import wsgiref.handlers + + +PORT = 8080 + + +def isInt(string): + try: + int(string) + return True + + except ValueError: + return False + + +def main(): + if 'REQUEST_URI' in os.environ: + # mod_cgi + + app = cgi.cgi_app + app = cgi.cgi_dispatcher(app) + app = cgi.cgi_error_handler(app) + app = cgi.cgi_encode(app) + + wsgiref.handlers.CGIHandler().run(app) + + elif len(sys.argv) <= 1 or isInt(sys.argv[1]): + # start internal (basic) http server + + if len(sys.argv) > 1 and isInt(sys.argv[1]): + argPort = int(sys.argv[1]) + if argPort > 0: + port = argPort + + else: + raise MorssException('Port must be positive integer') + + else: + port = PORT + + app = cgi.cgi_app + app = cgi.cgi_file_handler(app) + app = cgi.cgi_dispatcher(app) + app = cgi.cgi_error_handler(app) + app = cgi.cgi_encode(app) + + print('Serving http://localhost:%s/' % port) + httpd = wsgiref.simple_server.make_server('', port, app) + httpd.serve_forever() + + else: + # as a CLI app + try: + cli.cli_app() + + except (KeyboardInterrupt, SystemExit): + raise + + except Exception as e: + print('ERROR: %s' % e.message) if __name__ == '__main__': main() diff --git a/morss/cgi.py b/morss/cgi.py new file mode 100644 index 0000000..4e1accc --- /dev/null +++ b/morss/cgi.py @@ -0,0 +1,242 @@ +import sys +import os.path +import re + +import cgitb + +try: + # python 2 + from urllib import unquote +except ImportError: + # python 3 + from urllib.parse import unquote + +from . import crawler +from . import readabilite +from .morss import FeedFetch, FeedGather, FeedFormat +from .morss import Options, filterOptions, parseOptions +from .morss import log, DELAY, DEBUG, MorssException + +from . import cred + + +def cgi_parse_environ(environ): + # get options + + if 'REQUEST_URI' in environ: + url = environ['REQUEST_URI'][1:] + else: + url = environ['PATH_INFO'][1:] + + if environ['QUERY_STRING']: + url += '?' + environ['QUERY_STRING'] + + url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url) + + if url.startswith(':'): + split = url.split('/', 1) + + raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:] + + if len(split) > 1: + url = split[1] + else: + url = '' + + else: + raw_options = [] + + # init + options = Options(filterOptions(parseOptions(raw_options))) + + global DEBUG + DEBUG = options.debug + + return (url, options) + + +def cgi_app(environ, start_response): + url, options = cgi_parse_environ(environ) + + headers = {} + + # headers + headers['status'] = '200 OK' + headers['cache-control'] = 'max-age=%s' % DELAY + headers['x-content-type-options'] = 'nosniff' # safari work around + + if options.cors: + headers['access-control-allow-origin'] = '*' + + if options.html: + headers['content-type'] = 'text/html' + elif options.txt or options.silent: + headers['content-type'] = 'text/plain' + elif options.json: + headers['content-type'] = 'application/json' + elif options.callback: + headers['content-type'] = 'application/javascript' + elif options.csv: + headers['content-type'] = 'text/csv' + headers['content-disposition'] = 'attachment; filename="feed.csv"' + else: + headers['content-type'] = 'text/xml' + + headers['content-type'] += '; charset=utf-8' + + crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db')) + + # get the work done + url, rss = FeedFetch(url, options) + + start_response(headers['status'], list(headers.items())) + + rss = FeedGather(rss, url, options) + out = FeedFormat(rss, options) + + if options.silent: + return [''] + + else: + return [out] + + +def middleware(func): + " Decorator to turn a function into a wsgi middleware " + # This is called when parsing the "@middleware" code + + def app_builder(app): + # This is called when doing app = cgi_wrapper(app) + + def app_wrap(environ, start_response): + # This is called when a http request is being processed + + return func(environ, start_response, app) + + return app_wrap + + return app_builder + + +@middleware +def cgi_file_handler(environ, start_response, app): + " Simple HTTP server to serve static files (.html, .css, etc.) " + + files = { + '': 'text/html', + 'index.html': 'text/html', + 'sheet.xsl': 'text/xsl'} + + if 'REQUEST_URI' in environ: + url = environ['REQUEST_URI'][1:] + + else: + url = environ['PATH_INFO'][1:] + + if url in files: + headers = {} + + if url == '': + url = 'index.html' + + paths = [os.path.join(sys.prefix, 'share/morss/www', url), + os.path.join(os.path.dirname(__file__), '../www', url)] + + for path in paths: + try: + body = open(path, 'rb').read() + + headers['status'] = '200 OK' + headers['content-type'] = files[url] + start_response(headers['status'], list(headers.items())) + return [body] + + except IOError: + continue + + else: + # the for loop did not return, so here we are, i.e. no file found + headers['status'] = '404 Not found' + start_response(headers['status'], list(headers.items())) + return ['Error %s' % headers['status']] + + else: + return app(environ, start_response) + + +def cgi_get(environ, start_response): + url, options = cgi_parse_environ(environ) + + # get page + req = crawler.adv_get(url=url, timeout=TIMEOUT) + + if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']: + if options.get == 'page': + html = readabilite.parse(req['data'], encoding=req['encoding']) + html.make_links_absolute(req['url']) + + kill_tags = ['script', 'iframe', 'noscript'] + + for tag in kill_tags: + for elem in html.xpath('//'+tag): + elem.getparent().remove(elem) + + output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html') + + elif options.get == 'article': + output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug) + + else: + raise MorssException('no :get option passed') + + else: + output = req['data'] + + # return html page + headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse + start_response(headers['status'], list(headers.items())) + return [output] + + +dispatch_table = { + 'get': cgi_get, + } + + +@middleware +def cgi_dispatcher(environ, start_response, app): + url, options = cgi_parse_environ(environ) + + for key in dispatch_table.keys(): + if key in options: + return dispatch_table[key](environ, start_response) + + return app(environ, start_response) + + +@middleware +def cgi_error_handler(environ, start_response, app): + try: + return app(environ, start_response) + + except (KeyboardInterrupt, SystemExit): + raise + + except Exception as e: + headers = {'status': '500 Oops', 'content-type': 'text/html'} + start_response(headers['status'], list(headers.items()), sys.exc_info()) + log('ERROR: %s' % repr(e), force=True) + return [cgitb.html(sys.exc_info())] + + +@middleware +def cgi_encode(environ, start_response, app): + out = app(environ, start_response) + return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out] + + +application = cgi_app +application = cgi_file_handler(application) +application = cgi_dispatcher(application) +application = cgi_error_handler(application) +application = cgi_encode(application) diff --git a/morss/cli.py b/morss/cli.py new file mode 100644 index 0000000..98fef96 --- /dev/null +++ b/morss/cli.py @@ -0,0 +1,26 @@ +import sys +import os.path + +from . import crawler +from .morss import FeedFetch, FeedGather, FeedFormat +from .morss import Options, filterOptions, parseOptions +from .morss import log, DEBUG + + +def cli_app(): + options = Options(filterOptions(parseOptions(sys.argv[1:-1]))) + url = sys.argv[-1] + + global DEBUG + DEBUG = options.debug + + crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db')) + + url, rss = FeedFetch(url, options) + rss = FeedGather(rss, url, options) + out = FeedFormat(rss, options, 'unicode') + + if not options.silent: + print(out) + + log('done') diff --git a/morss/morss.py b/morss/morss.py index 4149396..74b0082 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -1,6 +1,4 @@ -import sys import os -import os.path import time from datetime import datetime @@ -16,20 +14,14 @@ from . import feeds from . import crawler from . import readabilite -import wsgiref.simple_server -import wsgiref.handlers -import cgitb - try: # python 2 from httplib import HTTPException - from urllib import unquote from urlparse import urlparse, urljoin, parse_qs except ImportError: # python 3 from http.client import HTTPException - from urllib.parse import unquote from urllib.parse import urlparse, urljoin, parse_qs MAX_ITEM = 5 # cache-only beyond @@ -42,7 +34,6 @@ DELAY = 10 * 60 # xml cache & ETag cache (in sec) TIMEOUT = 4 # http timeout (in sec) DEBUG = False -PORT = 8080 def filterOptions(options): @@ -437,299 +428,3 @@ def process(url, cache=None, options=None): rss = FeedGather(rss, url, options) return FeedFormat(rss, options, 'unicode') - - -def cgi_parse_environ(environ): - # get options - - if 'REQUEST_URI' in environ: - url = environ['REQUEST_URI'][1:] - else: - url = environ['PATH_INFO'][1:] - - if environ['QUERY_STRING']: - url += '?' + environ['QUERY_STRING'] - - url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url) - - if url.startswith(':'): - split = url.split('/', 1) - - raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:] - - if len(split) > 1: - url = split[1] - else: - url = '' - - else: - raw_options = [] - - # init - options = Options(filterOptions(parseOptions(raw_options))) - - global DEBUG - DEBUG = options.debug - - return (url, options) - - -def cgi_app(environ, start_response): - url, options = cgi_parse_environ(environ) - - headers = {} - - # headers - headers['status'] = '200 OK' - headers['cache-control'] = 'max-age=%s' % DELAY - headers['x-content-type-options'] = 'nosniff' # safari work around - - if options.cors: - headers['access-control-allow-origin'] = '*' - - if options.html: - headers['content-type'] = 'text/html' - elif options.txt or options.silent: - headers['content-type'] = 'text/plain' - elif options.json: - headers['content-type'] = 'application/json' - elif options.callback: - headers['content-type'] = 'application/javascript' - elif options.csv: - headers['content-type'] = 'text/csv' - headers['content-disposition'] = 'attachment; filename="feed.csv"' - else: - headers['content-type'] = 'text/xml' - - headers['content-type'] += '; charset=utf-8' - - crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db')) - - # get the work done - url, rss = FeedFetch(url, options) - - start_response(headers['status'], list(headers.items())) - - rss = FeedGather(rss, url, options) - out = FeedFormat(rss, options) - - if options.silent: - return [''] - - else: - return [out] - - -def middleware(func): - " Decorator to turn a function into a wsgi middleware " - # This is called when parsing the "@middleware" code - - def app_builder(app): - # This is called when doing app = cgi_wrapper(app) - - def app_wrap(environ, start_response): - # This is called when a http request is being processed - - return func(environ, start_response, app) - - return app_wrap - - return app_builder - - -@middleware -def cgi_file_handler(environ, start_response, app): - " Simple HTTP server to serve static files (.html, .css, etc.) " - - files = { - '': 'text/html', - 'index.html': 'text/html', - 'sheet.xsl': 'text/xsl'} - - if 'REQUEST_URI' in environ: - url = environ['REQUEST_URI'][1:] - - else: - url = environ['PATH_INFO'][1:] - - if url in files: - headers = {} - - if url == '': - url = 'index.html' - - paths = [os.path.join(sys.prefix, 'share/morss/www', url), - os.path.join(os.path.dirname(__file__), '../www', url)] - - for path in paths: - try: - body = open(path, 'rb').read() - - headers['status'] = '200 OK' - headers['content-type'] = files[url] - start_response(headers['status'], list(headers.items())) - return [body] - - except IOError: - continue - - else: - # the for loop did not return, so here we are, i.e. no file found - headers['status'] = '404 Not found' - start_response(headers['status'], list(headers.items())) - return ['Error %s' % headers['status']] - - else: - return app(environ, start_response) - - -def cgi_get(environ, start_response): - url, options = cgi_parse_environ(environ) - - # get page - req = crawler.adv_get(url=url, timeout=TIMEOUT) - - if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']: - if options.get == 'page': - html = readabilite.parse(req['data'], encoding=req['encoding']) - html.make_links_absolute(req['url']) - - kill_tags = ['script', 'iframe', 'noscript'] - - for tag in kill_tags: - for elem in html.xpath('//'+tag): - elem.getparent().remove(elem) - - output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html') - - elif options.get == 'article': - output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug) - - else: - raise MorssException('no :get option passed') - - else: - output = req['data'] - - # return html page - headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse - start_response(headers['status'], list(headers.items())) - return [output] - - -dispatch_table = { - 'get': cgi_get, - } - - -@middleware -def cgi_dispatcher(environ, start_response, app): - url, options = cgi_parse_environ(environ) - - for key in dispatch_table.keys(): - if key in options: - return dispatch_table[key](environ, start_response) - - return app(environ, start_response) - - -@middleware -def cgi_error_handler(environ, start_response, app): - try: - return app(environ, start_response) - - except (KeyboardInterrupt, SystemExit): - raise - - except Exception as e: - headers = {'status': '500 Oops', 'content-type': 'text/html'} - start_response(headers['status'], list(headers.items()), sys.exc_info()) - log('ERROR: %s' % repr(e), force=True) - return [cgitb.html(sys.exc_info())] - - -@middleware -def cgi_encode(environ, start_response, app): - out = app(environ, start_response) - return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out] - - -cgi_standalone_app = cgi_encode(cgi_error_handler(cgi_dispatcher(cgi_file_handler(cgi_app)))) - - -def cli_app(): - options = Options(filterOptions(parseOptions(sys.argv[1:-1]))) - url = sys.argv[-1] - - global DEBUG - DEBUG = options.debug - - crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db')) - - url, rss = FeedFetch(url, options) - rss = FeedGather(rss, url, options) - out = FeedFormat(rss, options, 'unicode') - - if not options.silent: - print(out) - - log('done') - - -def isInt(string): - try: - int(string) - return True - - except ValueError: - return False - - -def main(): - if 'REQUEST_URI' in os.environ: - # mod_cgi - - app = cgi_app - app = cgi_dispatcher(app) - app = cgi_error_handler(app) - app = cgi_encode(app) - - wsgiref.handlers.CGIHandler().run(app) - - elif len(sys.argv) <= 1 or isInt(sys.argv[1]): - # start internal (basic) http server - - if len(sys.argv) > 1 and isInt(sys.argv[1]): - argPort = int(sys.argv[1]) - if argPort > 0: - port = argPort - - else: - raise MorssException('Port must be positive integer') - - else: - port = PORT - - app = cgi_app - app = cgi_file_handler(app) - app = cgi_dispatcher(app) - app = cgi_error_handler(app) - app = cgi_encode(app) - - print('Serving http://localhost:%s/' % port) - httpd = wsgiref.simple_server.make_server('', port, app) - httpd.serve_forever() - - else: - # as a CLI app - try: - cli_app() - - except (KeyboardInterrupt, SystemExit): - raise - - except Exception as e: - print('ERROR: %s' % e.message) - -if __name__ == '__main__': - main()