3 changed files with 21 additions and 24 deletions
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -59,9 +59,7 @@ except NameError:
 MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
    'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
-    'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
-    'json': ['application/json'],
-    }
+    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}


 DEFAULT_UAS = [
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -207,10 +207,8 @@ def clean_root(root, keep_threshold=None):
 def clean_node(node, keep_threshold=None):
    parent = node.getparent()

-    # remove comments
    if (isinstance(node, lxml.html.HtmlComment)
            or isinstance(node, lxml.html.HtmlProcessingInstruction)):
-        parent.remove(node)
        return

    if parent is None:
@@ -244,6 +242,11 @@ def clean_node(node, keep_threshold=None):
        parent.remove(node)
        return

+    # remove comments
+    if isinstance(node, lxml.html.HtmlComment) or isinstance(node, lxml.html.HtmlProcessingInstruction):
+        parent.remove(node)
+        return
+
    # remove if too many kids & too high link density
    wc = count_words(node.text_content())
    if wc != 0 and len(list(node.iter())) > 3:
--- a/morss/wsgi.py
+++ b/morss/wsgi.py
@@ -192,10 +192,9 @@ def cgi_get(environ, start_response):
    url, options = cgi_parse_environ(environ)

    # get page
-    if options['get'] in ('page', 'article'):
    req = crawler.adv_get(url=url, timeout=TIMEOUT)

-        if req['contenttype'] in crawler.MIMETYPE['html']:
+    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
        if options['get'] == 'page':
            html = readabilite.parse(req['data'], encoding=req['encoding'])
            html.make_links_absolute(req['url'])
@@ -208,20 +207,17 @@ def cgi_get(environ, start_response):

            output = lxml.etree.tostring(html.getroottree(), encoding='utf-8', method='html')

-            else: # i.e. options['get'] == 'article'
+        elif options['get'] == 'article':
            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)

-        elif req['contenttype'] in crawler.MIMETYPE['xml'] + crawler.MIMETYPE['rss'] + crawler.MIMETYPE['json']:
-            output = req['data']
-
-        else:
-            raise MorssException('unsupported mimetype')
-
        else:
            raise MorssException('no :get option passed')

+    else:
+        output = req['data']
+
    # return html page
-    headers = {'status': '200 OK', 'content-type': req['contenttype'], 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
+    headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8', 'X-Frame-Options': 'SAMEORIGIN'} # SAMEORIGIN to avoid potential abuse
    start_response(headers['status'], list(headers.items()))
    return [output]