crawler: improve handling of non-ascii urls

master
pictuga 2022-01-30 23:27:49 +01:00
parent da81edc651
commit d6b90448f3
1 changed file with 16 additions and 30 deletions

@@ -32,18 +32,18 @@ from .caching import default_cache
 try:
     # python 2
-    from urllib import quote
+    from urllib import quote, unquote
     from httplib import HTTPMessage
     from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
                          Request, addinfourl, build_opener, parse_http_list,
                          parse_keqv_list)
-    from urlparse import urlparse, urlunparse
+    from urlparse import urlsplit
 
 except ImportError:
     # python 3
     from email import message_from_string
     from http.client import HTTPMessage
-    from urllib.parse import quote, urlparse, urlunparse
+    from urllib.parse import quote, unquote, urlsplit
     from urllib.request import (BaseHandler, HTTPCookieProcessor,
                                 HTTPRedirectHandler, Request, addinfourl,
                                 build_opener, parse_http_list, parse_keqv_list)
 
@@ -151,22 +151,10 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
     return build_opener(*handlers)
 
 
-def is_ascii(string):
-    # there's a native function in py3, but home-made fix for backward compatibility
-    try:
-        string.encode('ascii')
-
-    except UnicodeError:
-        return False
-
-    else:
-        return True
-
-
 def sanitize_url(url):
     # make sure the url is unicode, i.e. not bytes
     if isinstance(url, bytes):
-        url = url.decode()
+        url = url.decode('utf-8')
 
     # make sure there's a protocol (http://)
     if url.split(':', 1)[0] not in PROTOCOL:
@@ -175,22 +163,20 @@ def sanitize_url(url):
     # turns out some websites have really badly fomatted urls (fix http:/badurl)
     url = re.sub('^(https?):/([^/])', r'\1://\2', url)
 
-    # escape spaces
-    url = url.replace(' ', '%20')
-
-    # escape non-ascii unicode characters
-    # https://stackoverflow.com/a/4391299
-    parts = list(urlparse(url))
-
-    for i in range(len(parts)):
-        if not is_ascii(parts[i]):
-            if i == 1:
-                parts[i] = parts[i].encode('idna').decode('ascii')
-
-            else:
-                parts[i] = quote(parts[i].encode('utf-8'))
-
-    return urlunparse(parts)
+    # escape non-ascii unicode characters (also encode spaces as %20)
+    parts = urlsplit(url)
+
+    parts = parts._replace(
+        netloc=parts.netloc.replace(
+            parts.hostname,
+            parts.hostname.encode('idna').decode('ascii')
+        ),
+        path=quote(unquote(parts.path).encode('utf-8')),
+        query=quote(unquote(parts.query).encode('utf-8')),
+        fragment=quote(unquote(parts.fragment).encode('utf-8')),
+    )
+
+    return parts.geturl()
 
 
 class RespDataHandler(BaseHandler):
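
For reference, a minimal standalone sketch of the new escaping step (assumptions: Python 3 only; the helper name escape_non_ascii and the sample URL are made up for illustration). It reproduces the urlsplit/_replace logic from the diff: the hostname is IDNA-encoded, while path, query and fragment are round-tripped through unquote/quote so spaces and non-ascii characters come out percent-encoded without double-escaping already-escaped input:

# Minimal sketch of the escaping introduced above (hypothetical helper name)
from urllib.parse import quote, unquote, urlsplit

def escape_non_ascii(url):
    parts = urlsplit(url)

    parts = parts._replace(
        # IDNA-encode only the hostname; any port or userinfo in the
        # netloc is left untouched by the replace()
        netloc=parts.netloc.replace(
            parts.hostname,
            parts.hostname.encode('idna').decode('ascii')
        ),
        # unquote first so already-escaped input isn't double-escaped;
        # quote()'s default safe set is '/', so spaces become %20
        path=quote(unquote(parts.path).encode('utf-8')),
        query=quote(unquote(parts.query).encode('utf-8')),
        fragment=quote(unquote(parts.fragment).encode('utf-8')),
    )

    return parts.geturl()

print(escape_non_ascii('http://exämple.com/päth with spaces'))
# http://xn--exmple-cua.com/p%C3%A4th%20with%20spaces

Compared with the removed is_ascii() loop over urlparse() fields, this handles each URL component with the appropriate encoding (IDNA for the host, percent-encoding elsewhere) and drops the Python 2 compatibility helper.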