Compare commits

...

2 Commits

Author SHA1 Message Date
pictuga 271ac8f80f crawler: comment code a bit 2020-05-02 19:18:01 +02:00
pictuga 64e41b807d crawler: handle http:/ (single slash)
Fixing one more corner case! malayalam.oneindia.com
2020-05-02 19:17:15 +02:00
1 changed file with 7 additions and 1 deletion

View File

@@ -123,15 +123,21 @@ def is_ascii(string):
def sanitize_url(url): def sanitize_url(url):
# make sure the url is unicode, i.e. not bytes
if isinstance(url, bytes): if isinstance(url, bytes):
url = url.decode() url = url.decode()
# make sure there's a protocol (http://)
if url.split(':', 1)[0] not in PROTOCOL: if url.split(':', 1)[0] not in PROTOCOL:
url = 'http://' + url url = 'http://' + url
# turns out some websites have really badly formatted urls (fix http:/badurl)
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
# escape spaces
url = url.replace(' ', '%20') url = url.replace(' ', '%20')
# Escape non-ascii unicode characters # escape non-ascii unicode characters
# https://stackoverflow.com/a/4391299 # https://stackoverflow.com/a/4391299
parts = list(urlparse(url)) parts = list(urlparse(url))