From 749acc87fc44f0a21900668781937a64509173f1 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Tue, 28 Apr 2020 22:03:49 +0200
Subject: [PATCH] Centralize url clean up in crawler.py

---
 morss/crawler.py | 17 ++++++++++++++---
 morss/morss.py   | 26 --------------------------
 2 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index fe1af9f..4e68593 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -51,12 +51,15 @@ DEFAULT_UAS = [
 ]
 
 
+PROTOCOL = ['http', 'https']
+
+
 def get(*args, **kwargs):
     return adv_get(*args, **kwargs)[0]
 
 
 def adv_get(url, timeout=None, *args, **kwargs):
-    url = encode_url(url)
+    url = sanitize_url(url)
 
     if timeout is None:
         con = custom_handler(*args, **kwargs).open(url)
@@ -113,8 +116,16 @@ def is_ascii(string):
         return True
 
 
-def encode_url(url):
-    " Escape non-ascii unicode characters "
+def sanitize_url(url):
+    if isinstance(url, bytes):
+        url = url.decode()
+
+    if url.split(':', 1)[0] not in PROTOCOL:
+        url = 'http://' + url
+
+    url = url.replace(' ', '%20')
+
+    # Escape non-ascii unicode characters
 
     # https://stackoverflow.com/a/4391299
     parts = list(urlparse(url))
diff --git a/morss/morss.py b/morss/morss.py
index 4ddab5a..f6fed5d 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -44,8 +44,6 @@ TIMEOUT = 4 # http timeout (in sec)
 DEBUG = False
 PORT = 8080
 
-PROTOCOL = ['http', 'https']
-
 
 def filterOptions(options):
     return options
@@ -297,22 +295,6 @@ def ItemAfter(item, options):
     return item
 
 
-def UrlFix(url):
-    if url is None:
-        raise MorssException('No url provided')
-
-    if isinstance(url, bytes):
-        url = url.decode()
-
-    if urlparse(url).scheme not in PROTOCOL:
-        url = 'http://' + url
-    log(url)
-
-    url = url.replace(' ', '%20')
-
-    return url
-
-
 def FeedFetch(url, options):
     # fetch feed
     delay = DELAY
@@ -456,7 +438,6 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
 
@@ -529,7 +510,6 @@ def cgi_app(environ, start_response):
     crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
 
     # get the work done
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
 
     if headers['content-type'] == 'text/xml':
@@ -614,11 +594,6 @@ def cgi_get(environ, start_response):
     url, options = cgi_parse_environ(environ)
 
     # get page
-    PROTOCOL = ['http', 'https']
-
-    if urlparse(url).scheme not in ['http', 'https']:
-        url = 'http://' + url
-
     data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
 
     if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
@@ -698,7 +673,6 @@ def cli_app():
 
     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options, 'unicode')
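
-- 
A minimal standalone sketch (Python 3) of the sanitize_url() flow introduced
above, for reference. The crawler.py hunk is cut off right after
"parts = list(urlparse(url))", so the non-ascii escaping tail below is an
assumed continuation based on the linked Stack Overflow answer
(https://stackoverflow.com/a/4391299), not code taken from the patch:

from urllib.parse import urlparse, urlunparse, quote

PROTOCOL = ['http', 'https']


def sanitize_url(url):
    # Accept both bytes and str input, as the old UrlFix() did
    if isinstance(url, bytes):
        url = url.decode()

    # 'http://example.com' splits to 'http'; a bare 'example.com' has no ':',
    # so the whole string comes back, fails the membership test, and a
    # default scheme is prepended
    if url.split(':', 1)[0] not in PROTOCOL:
        url = 'http://' + url

    url = url.replace(' ', '%20')

    # Escape non-ascii unicode characters (assumed continuation): IDNA-encode
    # the hostname, percent-encode path and query; '%' is kept safe so the
    # '%20' substitution above is not double-escaped
    parts = list(urlparse(url))
    parts[1] = parts[1].encode('idna').decode('ascii')  # netloc
    parts[2] = quote(parts[2], safe='/%')               # path
    parts[4] = quote(parts[4], safe='=&%')              # query

    return urlunparse(parts)


# Usage: bare host, spaces, and non-ascii characters are all normalized
print(sanitize_url('bücher.example/straße münchen?q=grüße'))
# -> http://xn--bcher-kva.example/stra%C3%9Fe%20m%C3%BCnchen?q=gr%C3%BC%C3%9Fe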