299 lines
8.3 KiB

# This file is part of morss
# Copyright (C) 2013-2020 pictuga <>
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <>.
import cgitb
import mimetypes
import os.path
import re
import sys
import wsgiref.handlers
import wsgiref.simple_server
import wsgiref.util
import lxml.etree
# python 2
from urllib import unquote
except ImportError:
# python 3
from urllib.parse import unquote
from . import caching, crawler, readabilite
from .morss import (DELAY, TIMEOUT, FeedFetch, FeedFormat, FeedGather,
MorssException, Options, log)
from .util import data_path
PORT = int(os.getenv('PORT', 8000))
def parse_options(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
for option in options:
split = option.split('=', 1)
if len(split) > 1:
out[split[0]] = unquote(split[1]).replace('|', '/') # | -> / for backward compatibility (and Apache)
out[split[0]] = True
return out
def request_uri(environ):
if 'REQUEST_URI' in environ:
# when running on Apache/uwsgi
url = environ['REQUEST_URI']
elif 'RAW_URI' in environ:
# gunicorn
url = environ['RAW_URI']
# when using other servers
url = environ['PATH_INFO']
if environ['QUERY_STRING']:
url += '?' + environ['QUERY_STRING']
return url
def cgi_parse_environ(environ):
# get options
url = request_uri(environ)[1:]
url = re.sub(r'^(cgi/)?(|', '', url)
if url.startswith(':'):
parts = url.split('/', 1)
raw_options = parts[0].split(':')[1:]
url = parts[1] if len(parts) > 1 else ''
raw_options = []
# init
options = Options(parse_options(raw_options))
return (url, options)
def cgi_app(environ, start_response):
url, options = cgi_parse_environ(environ)
headers = {}
# headers
headers['status'] = '200 OK'
headers['cache-control'] = 'max-age=%s' % DELAY
headers['x-content-type-options'] = 'nosniff' # safari work around
if options.cors:
headers['access-control-allow-origin'] = '*'
if options.format == 'html':
headers['content-type'] = 'text/html'
elif options.txt or options.silent:
headers['content-type'] = 'text/plain'
elif options.format == 'json':
headers['content-type'] = 'application/json'
elif options.callback:
headers['content-type'] = 'application/javascript'
elif options.format == 'csv':
headers['content-type'] = 'text/csv'
headers['content-disposition'] = 'attachment; filename="feed.csv"'
headers['content-type'] = 'text/xml'
headers['content-type'] += '; charset=utf-8'
# get the work done
url, rss = FeedFetch(url, options)
start_response(headers['status'], list(headers.items()))
rss = FeedGather(rss, url, options)
out = FeedFormat(rss, options)
if options.silent:
return ['']
return [out]
def middleware(func):
" Decorator to turn a function into a wsgi middleware "
# This is called when parsing the "@middleware" code
def app_builder(app):
# This is called when doing app = cgi_wrapper(app)
def app_wrap(environ, start_response):
# This is called when a http request is being processed
return func(environ, start_response, app)
return app_wrap
return app_builder
def cgi_file_handler(environ, start_response, app):
" Simple HTTP server to serve static files (.html, .css, etc.) "
url = request_uri(environ)[1:]
if url == '':
url = 'index.html'
if re.match(r'^/?([a-zA-Z0-9_-][a-zA-Z0-9\._-]+/?)*$', url):
# if it is a legitimate url (no funny relative paths)
path = data_path('www', url)
f = open(path, 'rb')
except IOError:
# problem with file (cannot open or not found)
# file successfully open
headers = {}
headers['status'] = '200 OK'
headers['content-type'] = mimetypes.guess_type(path)[0] or 'application/octet-stream'
start_response(headers['status'], list(headers.items()))
return wsgiref.util.FileWrapper(f)
# regex didn't validate or no file found
return app(environ, start_response)
def cgi_get(environ, start_response):
url, options = cgi_parse_environ(environ)
# get page
if options['get'] in ('page', 'article'):
req = crawler.adv_get(url=url, timeout=TIMEOUT)
if req['contenttype'] in crawler.MIMETYPE['html']:
if options['get'] == 'page':
html = readabilite.parse(req['data'], encoding=req['encoding'])
kill_tags = ['script', 'iframe', 'noscript']
for tag in kill_tags:
for elem in html.xpath('//'+tag):
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')
else: # i.e. options['get'] == 'article'
output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
output = req['data']
raise MorssException('unsupported mimetype')
raise MorssException('no :get option passed')
# return html page
headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
start_response(headers['status'], list(headers.items()))
return [output]
dispatch_table = {
'get': cgi_get,
def cgi_dispatcher(environ, start_response, app):
url, options = cgi_parse_environ(environ)
for key in dispatch_table.keys():
if key in options:
return dispatch_table[key](environ, start_response)
return app(environ, start_response)
def cgi_error_handler(environ, start_response, app):
return app(environ, start_response)
except (KeyboardInterrupt, SystemExit):
except Exception as e:
headers = {'status': '404 Not Found', 'content-type': 'text/html', 'x-morss-error': repr(e)}
start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR: %s' % repr(e))
return [cgitb.html(sys.exc_info())]
def cgi_encode(environ, start_response, app):
out = app(environ, start_response)
return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out]
application = cgi_app
application = cgi_file_handler(application)
application = cgi_dispatcher(application)
application = cgi_error_handler(application)
application = cgi_encode(application)
def cgi_handle_request():
app = cgi_app
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
class WSGIRequestHandlerRequestUri(wsgiref.simple_server.WSGIRequestHandler):
def get_environ(self):
env = wsgiref.simple_server.WSGIRequestHandler.get_environ(self)
env['REQUEST_URI'] = self.path
return env
def cgi_start_server():
print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, application, handler_class=WSGIRequestHandlerRequestUri)
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):