From 78cea10eada4b4b9db7faba5d62db55dc7314366 Mon Sep 17 00:00:00 2001 From: pictuga Date: Thu, 9 Apr 2020 18:43:20 +0200 Subject: [PATCH] morss: replace :getpage with :get Also provides readabilite debugging --- morss/morss.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/morss/morss.py b/morss/morss.py index b0d1735..06694f1 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -636,7 +636,7 @@ def cgi_file_handler(environ, start_response, app): return app(environ, start_response) -def cgi_page(environ, start_response): +def cgi_get(environ, start_response): url, options = cgi_parse_environ(environ) # get page @@ -648,28 +648,35 @@ def cgi_page(environ, start_response): data, con, contenttype, encoding = crawler.adv_get(url=url) if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']: - html = readabilite.parse(data, encoding=encoding) - html.make_links_absolute(con.geturl()) + if options.get == 'page': + html = readabilite.parse(data, encoding=encoding) + html.make_links_absolute(con.geturl()) - kill_tags = ['script', 'iframe', 'noscript'] + kill_tags = ['script', 'iframe', 'noscript'] - for tag in kill_tags: - for elem in html.xpath('//'+tag): - elem.getparent().remove(elem) + for tag in kill_tags: + for elem in html.xpath('//'+tag): + elem.getparent().remove(elem) - output = lxml.etree.tostring(html.getroottree(), encoding='utf-8') + output = lxml.etree.tostring(html.getroottree(), encoding='utf-8') + + elif options.get == 'article': + output = readabilite.get_article(data, url=con.geturl(), encoding=encoding) + + else: + raise MorssException('no :get option passed') else: - output = None + output = data # return html page - headers = {'status': '200 OK', 'content-type': 'text/html'} + headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'} start_response(headers['status'], list(headers.items())) return [output] dispatch_table = { - 'getpage': cgi_page + 'get': cgi_get, }