From 749acc87fc44f0a21900668781937a64509173f1 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Tue, 28 Apr 2020 22:03:49 +0200
Subject: [PATCH] Centralize url clean up in crawler.py

---
 morss/crawler.py | 17 ++++++++++++++---
 morss/morss.py   | 26 --------------------------
 2 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index fe1af9f..4e68593 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -51,12 +51,15 @@ DEFAULT_UAS = [
 ]
 
 
+PROTOCOL = ['http', 'https']
+
+
 def get(*args, **kwargs):
     return adv_get(*args, **kwargs)[0]
 
 
 def adv_get(url, timeout=None, *args, **kwargs):
-    url = encode_url(url)
+    url = sanitize_url(url)
 
     if timeout is None:
         con = custom_handler(*args, **kwargs).open(url)
@@ -113,8 +116,16 @@ def is_ascii(string):
         return True
 
 
-def encode_url(url):
-    " Escape non-ascii unicode characters "
+def sanitize_url(url):
+    if isinstance(url, bytes):
+        url = url.decode()
+
+    if url.split(':', 1)[0] not in PROTOCOL:
+        url = 'http://' + url
+
+    url = url.replace(' ', '%20')
+
+    # Escape non-ascii unicode characters
 
     # https://stackoverflow.com/a/4391299
     parts = list(urlparse(url))
diff --git a/morss/morss.py b/morss/morss.py
index 4ddab5a..f6fed5d 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -44,8 +44,6 @@ TIMEOUT = 4 # http timeout (in sec)
 DEBUG = False
 PORT = 8080
 
-PROTOCOL = ['http', 'https']
-
 
 def filterOptions(options):
     return options
@@ -297,22 +295,6 @@ def ItemAfter(item, options):
     return item
 
 
-def UrlFix(url):
-    if url is None:
-        raise MorssException('No url provided')
-
-    if isinstance(url, bytes):
-        url = url.decode()
-
-    if urlparse(url).scheme not in PROTOCOL:
-        url = 'http://' + url
-    log(url)
-
-    url = url.replace(' ', '%20')
-
-    return url
-
-
 def FeedFetch(url, options):
     # fetch feed
     delay = DELAY
@@ -456,7 +438,6 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
 
@@ -529,7 +510,6 @@ def cgi_app(environ, start_response):
     crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
 
     # get the work done
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
 
     if headers['content-type'] == 'text/xml':
@@ -614,11 +594,6 @@ def cgi_get(environ, start_response):
     url, options = cgi_parse_environ(environ)
 
     # get page
-    PROTOCOL = ['http', 'https']
-
-    if urlparse(url).scheme not in ['http', 'https']:
-        url = 'http://' + url
-
     data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
 
     if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
@@ -698,7 +673,6 @@ def cli_app():
 
     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options, 'unicode')
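
-- 
A minimal standalone sketch (Python 3) of the sanitize_url() flow introduced
above, for reference. The crawler.py hunk is cut off right after
"parts = list(urlparse(url))", so the non-ascii escaping tail below is an
assumed continuation based on the linked Stack Overflow answer
(https://stackoverflow.com/a/4391299), not code taken from the patch:

from urllib.parse import urlparse, urlunparse, quote

PROTOCOL = ['http', 'https']


def sanitize_url(url):
    # Accept both bytes and str input, as the old UrlFix() did
    if isinstance(url, bytes):
        url = url.decode()

    # 'http://example.com' splits to 'http'; a bare 'example.com' has no ':',
    # so the whole string comes back, fails the membership test, and a
    # default scheme is prepended
    if url.split(':', 1)[0] not in PROTOCOL:
        url = 'http://' + url

    url = url.replace(' ', '%20')

    # Escape non-ascii unicode characters (assumed continuation): IDNA-encode
    # the hostname, percent-encode path and query; '%' is kept safe so the
    # '%20' substitution above is not double-escaped
    parts = list(urlparse(url))
    parts[1] = parts[1].encode('idna').decode('ascii')  # netloc
    parts[2] = quote(parts[2], safe='/%')               # path
    parts[4] = quote(parts[4], safe='=&%')              # query

    return urlunparse(parts)


# Usage: bare host, spaces, and non-ascii characters are all normalized
print(sanitize_url('bücher.example/straße münchen?q=grüße'))
# -> http://xn--bcher-kva.example/stra%C3%9Fe%20m%C3%BCnchen?q=gr%C3%BC%C3%9Fe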