crawler: handle http:/ (single slash)
Fixing one more corner case! malayalam.oneindia.com
master
parent
a2c4691090
commit
64e41b807d
|
@ -129,6 +129,9 @@ def sanitize_url(url):
|
||||||
if url.split(':', 1)[0] not in PROTOCOL:
|
if url.split(':', 1)[0] not in PROTOCOL:
|
||||||
url = 'http://' + url
|
url = 'http://' + url
|
||||||
|
|
||||||
|
# turns out some websites have really badly formatted urls (fix http:/badurl)
|
||||||
|
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
|
||||||
|
|
||||||
url = url.replace(' ', '%20')
|
url = url.replace(' ', '%20')
|
||||||
|
|
||||||
# Escape non-ascii unicode characters
|
# Escape non-ascii unicode characters
|
||||||
|
|
Loading…
Reference in New Issue