Compare commits
2 Commits
a2c4691090
...
271ac8f80f
Author | SHA1 | Date |
---|---|---|
pictuga | 271ac8f80f | |
pictuga | 64e41b807d |
|
@@ -123,15 +123,21 @@ def is_ascii(string):
|
||||||
|
|
||||||
|
|
||||||
def sanitize_url(url):
|
def sanitize_url(url):
|
||||||
|
# make sure the url is unicode, i.e. not bytes
|
||||||
if isinstance(url, bytes):
|
if isinstance(url, bytes):
|
||||||
url = url.decode()
|
url = url.decode()
|
||||||
|
|
||||||
|
# make sure there's a protocol (http://)
|
||||||
if url.split(':', 1)[0] not in PROTOCOL:
|
if url.split(':', 1)[0] not in PROTOCOL:
|
||||||
url = 'http://' + url
|
url = 'http://' + url
|
||||||
|
|
||||||
|
# turns out some websites have really badly formatted urls (fix http:/badurl)
|
||||||
|
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
|
||||||
|
|
||||||
|
# escape spaces
|
||||||
url = url.replace(' ', '%20')
|
url = url.replace(' ', '%20')
|
||||||
|
|
||||||
# Escape non-ascii unicode characters
|
# escape non-ascii unicode characters
|
||||||
# https://stackoverflow.com/a/4391299
|
# https://stackoverflow.com/a/4391299
|
||||||
parts = list(urlparse(url))
|
parts = list(urlparse(url))
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue