From 271ac8f80f5761da43f5345bec6a923b25677cf1 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 2 May 2020 19:18:01 +0200 Subject: [PATCH] crawler: comment code a bit --- morss/crawler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/morss/crawler.py b/morss/crawler.py index 8adf495..63f6c5d 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -123,18 +123,21 @@ def is_ascii(string): def sanitize_url(url): + # make sure the url is unicode, i.e. not bytes if isinstance(url, bytes): url = url.decode() + # make sure there's a protocol (http://) if url.split(':', 1)[0] not in PROTOCOL: url = 'http://' + url # turns out some websites have really badly fomatted urls (fix http:/badurl) url = re.sub('^(https?):/([^/])', r'\1://\2', url) + # escape spaces url = url.replace(' ', '%20') - # Escape non-ascii unicode characters + # escape non-ascii unicode characters # https://stackoverflow.com/a/4391299 parts = list(urlparse(url))