Compare commits
No commits in common. "cb69e3167f7d5e9a059e4fc3a7b71b4f0025f046" and "4a9b505499790b999007f712317902ad58071924" have entirely different histories.
cb69e3167f...4a9b505499
@@ -12,14 +12,10 @@ import random
 try:
     # python 2
     from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
-    from urllib import quote
-    from urlparse import urlparse, urlunparse
     import mimetools
 except ImportError:
     # python 3
     from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
-    from urllib.parse import quote
-    from urllib.parse import urlparse, urlunparse
     import email
 
 try:

@@ -56,8 +52,6 @@ def get(*args, **kwargs):
 
 
 def adv_get(url, timeout=None, *args, **kwargs):
-    url = encode_url(url)
-
     if timeout is None:
         con = custom_handler(*args, **kwargs).open(url)
 

@@ -101,34 +95,6 @@ def custom_handler(follow=None, delay=None, encoding=None):
     return build_opener(*handlers)
 
 
-def is_ascii(string):
-    # there's a native function in py3, but home-made fix for backward compatibility
-    try:
-        string.encode('ascii')
-
-    except UnicodeError:
-        return False
-
-    else:
-        return True
-
-
-def encode_url(url):
-    " Escape non-ascii unicode characters "
-    # https://stackoverflow.com/a/4391299
-    parts = list(urlparse(url))
-
-    for i in range(len(parts)):
-        if not is_ascii(parts[i]):
-            if i == 1:
-                parts[i] = parts[i].encode('idna').decode('ascii')
-
-            else:
-                parts[i] = quote(parts[i].encode('utf-8'))
-
-    return urlunparse(parts)
-
-
 class DebugHandler(BaseHandler):
     handler_order = 2000
 
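Note on the block removed above: encode_url() IDNA-encoded the hostname and percent-quoted any other non-ASCII URL component before adv_get() opened the URL, so after this change non-ASCII URLs reach the opener unescaped. For reference, a minimal Python 3 sketch of the same escaping; the helper name escape_non_ascii_url and the example URL are illustrative, not part of either commit:

    from urllib.parse import quote, urlparse, urlunparse

    def escape_non_ascii_url(url):
        # mirror the deleted encode_url(): IDNA-encode the host (index 1 of the
        # parsed 6-tuple), percent-quote every other non-ASCII component
        parts = list(urlparse(url))

        for i, part in enumerate(parts):
            try:
                part.encode('ascii')

            except UnicodeError:
                parts[i] = part.encode('idna').decode('ascii') if i == 1 else quote(part.encode('utf-8'))

        return urlunparse(parts)

    # escape_non_ascii_url('https://döner.example/päth')
    # -> 'https://xn--dner-5qa.example/p%C3%A4th'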
@@ -260,7 +260,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         log('non-text page')
         return True
 
-    out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
 
     if out is not None:
         item.content = out

@@ -460,7 +460,7 @@ def process(url, cache=None, options=None):
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
 
-    return FeedFormat(rss, options, 'unicode')
+    return FeedFormat(rss, options)
 
 
 def cgi_parse_environ(environ):

@@ -635,7 +635,7 @@ def cgi_get(environ, start_response):
         output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
 
     elif options.get == 'article':
-        output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug)
+        output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug)
 
     else:
         raise MorssException('no :get option passed')
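Both call sites above (ItemFill() and cgi_get()), like FeedFormat() in process(), stop asking for a specific output encoding and pass a single encoding= keyword instead, so whatever readabilite.get_article() returns by default is now handed straight through. A hedged sketch of how external code could obtain a str under either revision; article_as_text is a hypothetical helper, not part of the project, and it assumes the package is importable as morss:

    from morss import readabilite  # assumption: morss installed as a package

    def article_as_text(data, con, encoding):
        try:
            # older signature: request a unicode string directly
            return readabilite.get_article(data, url=con.geturl(),
                                           encoding_in=encoding, encoding_out='unicode')

        except TypeError:
            # newer signature: single encoding= keyword, output left to lxml's default (bytes)
            out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
            return out.decode(encoding) if isinstance(out, bytes) else out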
@@ -316,10 +316,10 @@ def get_best_node(ranked_grades):
     return lowest
 
 
-def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
+def get_article(data, url=None, encoding=None, debug=False, threshold=5):
     " Input a raw html string, returns a raw html string of the article "
 
-    html = parse(data, encoding_in)
+    html = parse(data, encoding)
     score_all(html)
     scores = rank_grades(get_all_scores(html))
 

@@ -341,7 +341,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
     if url:
         best.make_links_absolute(url)
 
-    return lxml.etree.tostring(best if not debug else html, pretty_print=True, encoding=encoding_out)
+    return lxml.etree.tostring(best if not debug else html, pretty_print=True)
 
 
 if __name__ == '__main__':

@@ -349,7 +349,7 @@ if __name__ == '__main__':
     from . import crawler
 
     data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
-    article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    article = get_article(data, url=con.geturl(), encoding=encoding)
 
     if not sys.flags.interactive:
-        print(article)
+        print(article.decode(encoding))
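The last two hunks explain each other: once encoding_out is gone, lxml.etree.tostring() is called without an encoding argument and therefore returns bytes (ASCII with character references) rather than a str, which is why the __main__ block now decodes before printing. A standalone illustration of that default; the sample markup is made up and only lxml is assumed:

    import lxml.etree

    root = lxml.etree.fromstring('<article><p>héllo</p></article>')

    as_bytes = lxml.etree.tostring(root, pretty_print=True)                     # default: bytes, 'é' -> '&#233;'
    as_text = lxml.etree.tostring(root, pretty_print=True, encoding='unicode')  # str, as the old encoding_out='unicode' did

    print(type(as_bytes), type(as_text))  # <class 'bytes'> <class 'str'>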