From 64e41b807d5605dbe52b2320fd017f9534506022 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 2 May 2020 19:17:15 +0200 Subject: [PATCH] crawler: handle http:/ (single slash) Fixing one more corner case! malayalam.oneindia.com --- morss/crawler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/morss/crawler.py b/morss/crawler.py index cc98838..8adf495 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -129,6 +129,9 @@ def sanitize_url(url): if url.split(':', 1)[0] not in PROTOCOL: url = 'http://' + url + # turns out some websites have really badly formatted urls (fix http:/badurl) + url = re.sub('^(https?):/([^/])', r'\1://\2', url) + url = url.replace(' ', '%20') # Escape non-ascii unicode characters