crawler: improve handling of non-ascii urls
parent da81edc651
commit d6b90448f3
@@ -32,18 +32,18 @@ from .caching import default_cache

 try:
     # python 2
-    from urllib import quote
+    from urllib import quote, unquote

     from httplib import HTTPMessage
     from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
                          Request, addinfourl, build_opener, parse_http_list,
                          parse_keqv_list)
-    from urlparse import urlparse, urlunparse
+    from urlparse import urlsplit
 except ImportError:
     # python 3
     from email import message_from_string
     from http.client import HTTPMessage
-    from urllib.parse import quote, urlparse, urlunparse
+    from urllib.parse import quote, unquote, urlsplit
     from urllib.request import (BaseHandler, HTTPCookieProcessor,
                                 HTTPRedirectHandler, Request, addinfourl,
                                 build_opener, parse_http_list, parse_keqv_list)
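Note on the import changes above: urlsplit() is swapped in for urlparse()/urlunparse() because it returns a SplitResult namedtuple whose fields can be rewritten one at a time with _replace() and reassembled with geturl(); unquote is needed for the decode-then-re-encode round trip used further down. A minimal sketch of that standard-library behavior, with a made-up URL (not part of this commit):

    from urllib.parse import urlsplit

    parts = urlsplit('http://exämple.com/pâth?q=café#frag')
    # SplitResult(scheme='http', netloc='exämple.com', path='/pâth',
    #             query='q=café', fragment='frag')

    # SplitResult is a namedtuple: _replace() swaps individual fields
    # and geturl() rebuilds the URL string without re-parsing it
    print(parts._replace(fragment='').geturl())  # http://exämple.com/pâth?q=café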
@@ -151,22 +151,10 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
     return build_opener(*handlers)


-def is_ascii(string):
-    # there's a native function in py3, but home-made fix for backward compatibility
-    try:
-        string.encode('ascii')
-
-    except UnicodeError:
-        return False
-
-    else:
-        return True
-
-
 def sanitize_url(url):
     # make sure the url is unicode, i.e. not bytes
     if isinstance(url, bytes):
-        url = url.decode()
+        url = url.decode('utf-8')

     # make sure there's a protocol (http://)
     if url.split(':', 1)[0] not in PROTOCOL:
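Two changes in this hunk. The is_ascii() helper is deleted because the rewritten sanitize_url (next hunk) no longer tests components for ASCII-ness: it IDNA-encodes the hostname and percent-encodes the other parts unconditionally, and both encodings pass plain-ASCII input through essentially unchanged (spaces and other reserved characters still get escaped, which is the intent). The bare url.decode() also gains an explicit codec: Python 3 already defaults to UTF-8, but Python 2's bytes.decode() defaults to ASCII and would raise on exactly the non-ascii URLs this commit targets. A small illustration with a hypothetical byte string:

    # b'\xc3\xa9' is 'é' in UTF-8; an explicit codec decodes it the same way
    # on both interpreters, while Python 2's implicit ASCII default would raise
    raw = b'https://caf\xc3\xa9.example/'
    assert raw.decode('utf-8') == u'https://café.example/'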
@@ -175,22 +163,20 @@ def sanitize_url(url):
     # turns out some websites have really badly fomatted urls (fix http:/badurl)
     url = re.sub('^(https?):/([^/])', r'\1://\2', url)

-    # escape spaces
-    url = url.replace(' ', '%20')
-
-    # escape non-ascii unicode characters
-    # https://stackoverflow.com/a/4391299
-    parts = list(urlparse(url))
-
-    for i in range(len(parts)):
-        if not is_ascii(parts[i]):
-            if i == 1:
-                parts[i] = parts[i].encode('idna').decode('ascii')
-
-            else:
-                parts[i] = quote(parts[i].encode('utf-8'))
-
-    return urlunparse(parts)
+    # escape non-ascii unicode characters (also encode spaces as %20)
+    parts = urlsplit(url)
+
+    parts = parts._replace(
+        netloc=parts.netloc.replace(
+            parts.hostname,
+            parts.hostname.encode('idna').decode('ascii')
+        ),
+        path=quote(unquote(parts.path).encode('utf-8')),
+        query=quote(unquote(parts.query).encode('utf-8')),
+        fragment=quote(unquote(parts.fragment).encode('utf-8')),
+    )
+
+    return parts.geturl()


 class RespDataHandler(BaseHandler):
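The rewritten body parses the URL once and rebuilds each component: the hostname is converted to its IDNA (punycode) form, while path, query, and fragment are run through unquote() before quote(), so URLs that are already percent-encoded come back unchanged instead of having their '%' signs escaped a second time. Swapping the hostname inside netloc via str.replace(), rather than overwriting netloc wholesale, preserves any port number or credentials around it. Roughly what this yields, assuming the module is importable as crawler and 'https' is listed in its PROTOCOL constant (URLs made up):

    >>> from crawler import sanitize_url
    >>> sanitize_url(u'https://пример.example/путь')
    'https://xn--e1afmkfd.example/%D0%BF%D1%83%D1%82%D1%8C'
    >>> sanitize_url('https://example.com/a b')    # spaces still become %20
    'https://example.com/a%20b'
    >>> sanitize_url('https://example.com/a%20b')  # no double-escaping
    'https://example.com/a%20b'

One caveat: quote() with its default safe='/' also escapes '=' and '&', so a raw query such as a=1&b=2 would come back as a%3D1%26b%3D2; the round trip is safest on queries that arrive already percent-encoded.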