crawler: handle http:/ (single slash)

Fixing one more corner case! malayalam.oneindia.com
master
pictuga 2020-05-02 19:17:15 +02:00
parent a2c4691090
commit 64e41b807d
1 changed file with 3 additions and 0 deletions

View File

@@ -129,6 +129,9 @@ def sanitize_url(url):
if url.split(':', 1)[0] not in PROTOCOL: if url.split(':', 1)[0] not in PROTOCOL:
url = 'http://' + url url = 'http://' + url
# turns out some websites have really badly formatted urls (fix http:/badurl)
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
url = url.replace(' ', '%20') url = url.replace(' ', '%20')
# Escape non-ascii unicode characters # Escape non-ascii unicode characters