split morss.py into __main__/cgi/cli.py

Should hopefully allow cleaner code in the future
branch: master
pictuga, 2020-08-21 22:17:55 +02:00
parent c6d3a0eb53, commit c6b52e625f
6 changed files with 341 additions and 307 deletions

main.py

@@ -1,6 +1,7 @@
 #!/usr/bin/env python

-from morss import main, cgi_standalone_app as application
+from morss.__main__ import main
+from morss.cgi import application

 if __name__ == '__main__':
     main()
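main.py keeps exporting a module-level `application`, so the wrapper still works under any WSGI host. A minimal smoke-test sketch using only the standard library, assuming main.py is on the import path (the port is arbitrary):

import wsgiref.simple_server

from main import application  # the callable re-exported above

# serve the fully wrapped morss app locally
httpd = wsgiref.simple_server.make_server('', 8000, application)
httpd.serve_forever()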

morss/__init__.py

@@ -1,2 +1,3 @@
 # ran on `import morss`
 from .morss import *
+from .cgi import application
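The re-export keeps the WSGI callable reachable from the package root, so deployments importing it from `morss` directly keep working. A quick check, assuming the package is installed:

import morss
import morss.cgi

# both names point at the same fully wrapped app
assert morss.application is morss.cgi.application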

morss/__main__.py

@@ -1,5 +1,74 @@
 # ran on `python -m morss`

-from .morss import main
+import os
+import sys
+
+from . import cgi
+from . import cli
+
+from .morss import MorssException
+
+import wsgiref.simple_server
+import wsgiref.handlers
+
+
+PORT = 8080
+
+
+def isInt(string):
+    try:
+        int(string)
+        return True
+
+    except ValueError:
+        return False
+
+
+def main():
+    if 'REQUEST_URI' in os.environ:
+        # mod_cgi
+
+        app = cgi.cgi_app
+        app = cgi.cgi_dispatcher(app)
+        app = cgi.cgi_error_handler(app)
+        app = cgi.cgi_encode(app)
+
+        wsgiref.handlers.CGIHandler().run(app)
+
+    elif len(sys.argv) <= 1 or isInt(sys.argv[1]):
+        # start internal (basic) http server
+        if len(sys.argv) > 1 and isInt(sys.argv[1]):
+            argPort = int(sys.argv[1])
+
+            if argPort > 0:
+                port = argPort
+
+            else:
+                raise MorssException('Port must be positive integer')
+
+        else:
+            port = PORT
+
+        app = cgi.cgi_app
+        app = cgi.cgi_file_handler(app)
+        app = cgi.cgi_dispatcher(app)
+        app = cgi.cgi_error_handler(app)
+        app = cgi.cgi_encode(app)
+
+        print('Serving http://localhost:%s/' % port)
+
+        httpd = wsgiref.simple_server.make_server('', port, app)
+        httpd.serve_forever()
+
+    else:
+        # as a CLI app
+        try:
+            cli.cli_app()
+
+        except (KeyboardInterrupt, SystemExit):
+            raise
+
+        except Exception as e:
+            print('ERROR: %s' % e)  # was `e.message`, which breaks on python 3
+
+
 if __name__ == '__main__':
     main()
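main() picks a mode from the environment and argv: a set REQUEST_URI means mod_cgi, a missing or integer first argument starts the built-in server, and anything else falls through to the CLI. The pivot is isInt; a quick demonstration of how arguments route (the url is a placeholder):

from morss.__main__ import isInt

assert isInt('8080')                     # `python -m morss 8080` -> http server
assert not isInt('http://example.com/')  # `python -m morss <url>` -> CLI mode
assert not isInt('')                     # ValueError is caught -> False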

morss/cgi.py (new file, mode 100644, 242 lines)
import sys
import os.path
import re

import lxml.etree  # needed by cgi_get below; missing from the commit as published

import cgitb

try:
    # python 2
    from urllib import unquote

except ImportError:
    # python 3
    from urllib.parse import unquote

from . import crawler
from . import readabilite

from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options, filterOptions, parseOptions
# TIMEOUT added to this import: cgi_get uses it but the commit did not pull it in
from .morss import log, TIMEOUT, DELAY, DEBUG, MorssException


def cgi_parse_environ(environ):
    # get options

    if 'REQUEST_URI' in environ:
        url = environ['REQUEST_URI'][1:]

    else:
        url = environ['PATH_INFO'][1:]

        if environ['QUERY_STRING']:
            url += '?' + environ['QUERY_STRING']

    url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url)

    if url.startswith(':'):
        split = url.split('/', 1)

        raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:]

        if len(split) > 1:
            url = split[1]

        else:
            url = ''

    else:
        raw_options = []

    # init
    options = Options(filterOptions(parseOptions(raw_options)))

    global DEBUG
    DEBUG = options.debug

    return (url, options)


def cgi_app(environ, start_response):
    url, options = cgi_parse_environ(environ)

    headers = {}

    # headers
    headers['status'] = '200 OK'
    headers['cache-control'] = 'max-age=%s' % DELAY
    headers['x-content-type-options'] = 'nosniff' # safari work around

    if options.cors:
        headers['access-control-allow-origin'] = '*'

    if options.html:
        headers['content-type'] = 'text/html'
    elif options.txt or options.silent:
        headers['content-type'] = 'text/plain'
    elif options.json:
        headers['content-type'] = 'application/json'
    elif options.callback:
        headers['content-type'] = 'application/javascript'
    elif options.csv:
        headers['content-type'] = 'text/csv'
        headers['content-disposition'] = 'attachment; filename="feed.csv"'
    else:
        headers['content-type'] = 'text/xml'

    headers['content-type'] += '; charset=utf-8'

    crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))

    # get the work done
    url, rss = FeedFetch(url, options)

    start_response(headers['status'], list(headers.items()))

    rss = FeedGather(rss, url, options)
    out = FeedFormat(rss, options)

    if options.silent:
        return ['']

    else:
        return [out]


def middleware(func):
    " Decorator to turn a function into a wsgi middleware "
    # This is called when parsing the "@middleware" code

    def app_builder(app):
        # This is called when doing app = cgi_wrapper(app)

        def app_wrap(environ, start_response):
            # This is called when a http request is being processed

            return func(environ, start_response, app)

        return app_wrap

    return app_builder


@middleware
def cgi_file_handler(environ, start_response, app):
    " Simple HTTP server to serve static files (.html, .css, etc.) "

    files = {
        '': 'text/html',
        'index.html': 'text/html',
        'sheet.xsl': 'text/xsl'}

    if 'REQUEST_URI' in environ:
        url = environ['REQUEST_URI'][1:]

    else:
        url = environ['PATH_INFO'][1:]

    if url in files:
        headers = {}

        if url == '':
            url = 'index.html'

        paths = [os.path.join(sys.prefix, 'share/morss/www', url),
                 os.path.join(os.path.dirname(__file__), '../www', url)]

        for path in paths:
            try:
                body = open(path, 'rb').read()

                headers['status'] = '200 OK'
                headers['content-type'] = files[url]
                start_response(headers['status'], list(headers.items()))
                return [body]

            except IOError:
                continue

        else:
            # the for loop did not return, so here we are, i.e. no file found
            headers['status'] = '404 Not found'
            start_response(headers['status'], list(headers.items()))
            return ['Error %s' % headers['status']]

    else:
        return app(environ, start_response)


def cgi_get(environ, start_response):
    url, options = cgi_parse_environ(environ)

    # get page
    req = crawler.adv_get(url=url, timeout=TIMEOUT)

    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
        if options.get == 'page':
            html = readabilite.parse(req['data'], encoding=req['encoding'])
            html.make_links_absolute(req['url'])

            kill_tags = ['script', 'iframe', 'noscript']

            for tag in kill_tags:
                for elem in html.xpath('//'+tag):
                    elem.getparent().remove(elem)

            output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')

        elif options.get == 'article':
            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)

        else:
            raise MorssException('no :get option passed')

    else:
        output = req['data']

    # return html page
    headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
    start_response(headers['status'], list(headers.items()))

    return [output]


dispatch_table = {
    'get': cgi_get,
    }


@middleware
def cgi_dispatcher(environ, start_response, app):
    url, options = cgi_parse_environ(environ)

    for key in dispatch_table.keys():
        if key in options:
            return dispatch_table[key](environ, start_response)

    return app(environ, start_response)


@middleware
def cgi_error_handler(environ, start_response, app):
    try:
        return app(environ, start_response)

    except (KeyboardInterrupt, SystemExit):
        raise

    except Exception as e:
        headers = {'status': '500 Oops', 'content-type': 'text/html'}
        start_response(headers['status'], list(headers.items()), sys.exc_info())
        log('ERROR: %s' % repr(e), force=True)
        return [cgitb.html(sys.exc_info())]


@middleware
def cgi_encode(environ, start_response, app):
    out = app(environ, start_response)
    return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]


application = cgi_app
application = cgi_file_handler(application)
application = cgi_dispatcher(application)
application = cgi_error_handler(application)
application = cgi_encode(application)
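middleware is a plain closure factory: the decorated function receives the downstream app as its third argument, so the last wrapper applied runs first on each request. A self-contained toy with the same shape as the application stack above (add_marker and hello are invented for illustration):

from morss.cgi import middleware, cgi_encode

@middleware
def add_marker(environ, start_response, app):
    # toy wrapper: runs before the downstream app on every request
    environ['x.marker'] = 'seen'
    return app(environ, start_response)

def hello(environ, start_response):
    start_response('200 OK', [('content-type', 'text/plain')])
    return ['hello ' + environ['x.marker']]  # str, turned into bytes by cgi_encode

app = cgi_encode(add_marker(hello))  # outermost wrapper runs first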

morss/cli.py (new file, mode 100644, 26 lines)

import sys
import os.path

from . import crawler

from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options, filterOptions, parseOptions
from .morss import log, DEBUG


def cli_app():
    options = Options(filterOptions(parseOptions(sys.argv[1:-1])))
    url = sys.argv[-1]

    global DEBUG
    DEBUG = options.debug

    crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))

    url, rss = FeedFetch(url, options)
    rss = FeedGather(rss, url, options)
    out = FeedFormat(rss, options, 'unicode')

    if not options.silent:
        print(out)

    log('done')
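cli_app reads option tokens from every argument but the last and treats the final argument as the feed url, so the url must always come last. A sketch of driving it programmatically; the option token and url are placeholders, see parseOptions in morss/morss.py for the accepted forms:

import sys

from morss.cli import cli_app

# equivalent to: python -m morss <options...> <url>
sys.argv = ['morss', 'txt', 'http://example.com/feed']  # placeholder arguments
cli_app()  # prints the processed feed unless the silent option is set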

morss/morss.py

@@ -1,6 +1,4 @@
-import sys
 import os
-import os.path
 import time

 from datetime import datetime
@@ -16,20 +14,14 @@ from . import feeds
 from . import crawler
 from . import readabilite

-import wsgiref.simple_server
-import wsgiref.handlers
-
-import cgitb
-
 try:
     # python 2
     from httplib import HTTPException
-    from urllib import unquote
     from urlparse import urlparse, urljoin, parse_qs

 except ImportError:
     # python 3
     from http.client import HTTPException
-    from urllib.parse import unquote
     from urllib.parse import urlparse, urljoin, parse_qs


 MAX_ITEM = 5 # cache-only beyond
@@ -42,7 +34,6 @@ DELAY = 10 * 60 # xml cache & ETag cache (in sec)
 TIMEOUT = 4 # http timeout (in sec)

 DEBUG = False
-PORT = 8080


 def filterOptions(options):
@@ -437,299 +428,3 @@ def process(url, cache=None, options=None):
     rss = FeedGather(rss, url, options)

     return FeedFormat(rss, options, 'unicode')
[296 deleted lines omitted: cgi_parse_environ, cgi_app, middleware, cgi_file_handler, cgi_get, dispatch_table, cgi_dispatcher, cgi_error_handler, cgi_encode and the cgi_standalone_app stack moved to morss/cgi.py (cgi_standalone_app renamed to application); cli_app moved to morss/cli.py; isInt, main and the `if __name__ == '__main__'` guard moved to morss/__main__.py. The deleted code is identical to the new files shown above.]