From cb69e3167f7d5e9a059e4fc3a7b71b4f0025f046 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Tue, 28 Apr 2020 14:47:23 +0200
Subject: [PATCH] crawler: accept non-ascii urls

Covering one more corner case!
---
 morss/crawler.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/morss/crawler.py b/morss/crawler.py
index 1fbe98a..fe1af9f 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -12,10 +12,14 @@ import random
 try:
     # python 2
     from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
+    from urllib import quote
+    from urlparse import urlparse, urlunparse
     import mimetools
 except ImportError:
     # python 3
     from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
+    from urllib.parse import quote
+    from urllib.parse import urlparse, urlunparse
     import email

 try:
@@ -52,6 +56,8 @@ def get(*args, **kwargs):


 def adv_get(url, timeout=None, *args, **kwargs):
+    url = encode_url(url)
+
     if timeout is None:
         con = custom_handler(*args, **kwargs).open(url)

@@ -95,6 +101,34 @@ def custom_handler(follow=None, delay=None, encoding=None):
     return build_opener(*handlers)


+def is_ascii(string):
+    # there's a native function in py3, but home-made fix for backward compatibility
+    try:
+        string.encode('ascii')
+
+    except UnicodeError:
+        return False
+
+    else:
+        return True
+
+
+def encode_url(url):
+    " Escape non-ascii unicode characters "
+    # https://stackoverflow.com/a/4391299
+    parts = list(urlparse(url))
+
+    for i in range(len(parts)):
+        if not is_ascii(parts[i]):
+            if i == 1:
+                parts[i] = parts[i].encode('idna').decode('ascii')
+
+            else:
+                parts[i] = quote(parts[i].encode('utf-8'))
+
+    return urlunparse(parts)
+
+
 class DebugHandler(BaseHandler):
     handler_order = 2000
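
For context, a minimal standalone sketch (Python 3 only) of the two helpers this patch adds, so their behaviour can be checked outside of morss. The example URL is hypothetical; the idea is that a non-ASCII host name is converted to its IDNA ("punycode") form, while the other URL components are percent-encoded as UTF-8, and an already-ASCII URL passes through unchanged.

```python
# Standalone sketch mirroring the helpers added in this patch (Python 3 only).
from urllib.parse import quote, urlparse, urlunparse


def is_ascii(string):
    # True if the string contains only ASCII characters
    try:
        string.encode('ascii')
    except UnicodeError:
        return False
    else:
        return True


def encode_url(url):
    # Escape non-ascii unicode characters (https://stackoverflow.com/a/4391299)
    parts = list(urlparse(url))

    for i in range(len(parts)):
        if not is_ascii(parts[i]):
            if i == 1:
                # netloc: encode the host name with IDNA ("punycode")
                parts[i] = parts[i].encode('idna').decode('ascii')
            else:
                # other components: percent-encode the UTF-8 bytes
                parts[i] = quote(parts[i].encode('utf-8'))

    return urlunparse(parts)


# Hypothetical example: both the host and the path contain non-ASCII characters.
print(encode_url('https://exämple.com/søk'))
# -> https://xn--exmple-cua.com/s%C3%B8k

# An all-ASCII URL is returned as-is.
print(encode_url('https://example.com/search'))
# -> https://example.com/search
```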