crawler: fix regression from d6b90448f3

pytest: crawler
2022-02-01 23:18:16 +01:00 · 2022-02-01 23:16:43 +01:00
19 changed files with 9391 additions and 6 deletions
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -32,7 +32,7 @@ from .caching import default_cache
 try:
    # python 2
-    from urllib import quote, unquote
+    from urllib import quote
    from httplib import HTTPMessage
    from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
@@ -43,7 +43,7 @@ except ImportError:
    # python 3
    from email import message_from_string
    from http.client import HTTPMessage
-    from urllib.parse import quote, unquote, urlsplit
+    from urllib.parse import quote, urlsplit
    from urllib.request import (BaseHandler, HTTPCookieProcessor,
                                HTTPRedirectHandler, Request, addinfourl,
                                build_opener, parse_http_list, parse_keqv_list)
@@ -151,6 +151,28 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
    return build_opener(*handlers)
 def is_ascii(string):
     " tell whether `string` can be encoded as plain ascii "
     # py3 ships a native equivalent, but this home-made check is kept for
     # backward compatibility with older interpreters
     try:
         string.encode('ascii')
         return True
     except UnicodeError:
         # at least one character does not fit in ascii
         return False
 def soft_quote(string):
     """ url-quote only when not a valid ascii string

     `string` is a text url component (path, query or fragment). Pure ascii
     input is returned untouched, so existing escapes and delimiters are left
     as they are. Otherwise the utf-8 bytes of the string are percent-encoded
     and the quoted text is returned.
     """
     if is_ascii(string):
         return string
     # Keep url delimiters ('=', '&', ':', ...) and already-present '%XX'
     # escapes intact: with quote()'s default safe='/' a query such as
     # 'a=é&b=1' would come out as 'a%3D%C3%A9%26b%3D1' and an existing
     # '%20' would be double-encoded into '%2520'.
     return quote(string.encode('utf-8'), safe="!#$%&'()*+,/:;=?@[]~")
 def sanitize_url(url):
    # make sure the url is unicode, i.e. not bytes
    if isinstance(url, bytes):
@@ -163,7 +185,10 @@ def sanitize_url(url):
    # turns out some websites have really badly formatted urls (fix http:/badurl)
    url = re.sub('^(https?):/([^/])', r'\1://\2', url)
-    # escape non-ascii unicode characters (also encode spaces as %20)
+    # escape spaces
    url = url.replace(' ', '%20')
    # escape non-ascii unicode characters
    parts = urlsplit(url)
    parts = parts._replace(
@@ -171,9 +196,9 @@ def sanitize_url(url):
            parts.hostname,
            parts.hostname.encode('idna').decode('ascii')
            ),
-        path=quote(unquote(parts.path).encode('utf-8')),
+        path=soft_quote(parts.path),
-        query=quote(unquote(parts.query).encode('utf-8')),
+        query=soft_quote(parts.query),
-        fragment=quote(unquote(parts.fragment).encode('utf-8')),
+        fragment=soft_quote(parts.fragment),
    )
    return parts.geturl()
--- a/tests/samples/200-ok.txt
+++ b/tests/samples/200-ok.txt
@@ -0,0 +1,4 @@
 HTTP/1.1 200 OK
 content-type: text/plain
 success
--- a/tests/samples/301-redirect-abs.txt
+++ b/tests/samples/301-redirect-abs.txt
@@ -0,0 +1,3 @@
 HTTP/1.1 301 Moved Permanently
 location: /200-ok.txt
--- a/tests/samples/301-redirect-rel.txt
+++ b/tests/samples/301-redirect-rel.txt
@@ -0,0 +1,3 @@
 HTTP/1.1 301 Moved Permanently
 location: ./200-ok.txt
--- a/tests/samples/301-redirect-url.txt
+++ b/tests/samples/301-redirect-url.txt
@@ -0,0 +1,3 @@
 HTTP/1.1 301 Moved Permanently
 location: http://localhost:8888/200-ok.txt
--- a/tests/samples/308-redirect.txt
+++ b/tests/samples/308-redirect.txt
@@ -0,0 +1,4 @@
 HTTP/1.1 308 Permanent Redirect
 location: /200-ok.txt
 /200-ok.txt
--- a/tests/samples/alternate-abs.txt
+++ b/tests/samples/alternate-abs.txt
@@ -0,0 +1,8 @@
 HTTP/1.1 200 OK
 content-type: text/html; charset=UTF-8
 <!DOCTYPE html>
 <html>
 <head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
 <body>meta redirect</body>
 </html>
--- a/tests/samples/enc-gb2312-header.txt
+++ b/tests/samples/enc-gb2312-header.txt
@@ -0,0 +1,4 @@
 HTTP/1.1 200 OK
 content-type: text/plain; charset=gb2312
 <EFBFBD>ɹ<EFBFBD>
--- a/tests/samples/enc-gb2312-meta.txt
+++ b/tests/samples/enc-gb2312-meta.txt
@@ -0,0 +1,10 @@
 HTTP/1.1 200 OK
 content-type: text/html
 <!DOCTYPE html>
 <html>
 <head><meta charset="gb2312"/></head>
 <body>
 <EFBFBD>ɹ<EFBFBD>
 </body></html>
--- a/tests/samples/enc-iso-8859-1-header.txt
+++ b/tests/samples/enc-iso-8859-1-header.txt
@@ -0,0 +1,4 @@
 HTTP/1.1 200 OK
 content-type: text/plain; charset=iso-8859-1
 succ<EFBFBD>s
--- a/tests/samples/enc-iso-8859-1-missing.txt
+++ b/tests/samples/enc-iso-8859-1-missing.txt
@@ -0,0 +1,4 @@
 HTTP/1.1 200 OK
 content-type: text/plain
 succ<EFBFBD>s
--- a/tests/samples/enc-utf-8-header.txt
+++ b/tests/samples/enc-utf-8-header.txt
@@ -0,0 +1,4 @@
 HTTP/1.1 200 OK
 content-type: text/plain; charset=UTF-8
 succès
--- a/tests/samples/gzip.txt
+++ b/tests/samples/gzip.txt
--- a/tests/samples/header-refresh.txt
+++ b/tests/samples/header-refresh.txt
@@ -0,0 +1,3 @@
 HTTP/1.1 200 OK
 refresh: 0;url=/200-ok.txt
--- a/tests/samples/meta-redirect-abs.txt
+++ b/tests/samples/meta-redirect-abs.txt
@@ -0,0 +1,8 @@
 HTTP/1.1 200 OK
 content-type: text/html; charset=UTF-8
 <!DOCTYPE html>
 <html>
 <head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
 <body>meta redirect</body>
 </html>
--- a/tests/samples/meta-redirect-rel.txt
+++ b/tests/samples/meta-redirect-rel.txt
@@ -0,0 +1,8 @@
 HTTP/1.1 200 OK
 content-type: text/html; charset=UTF-8
 <!DOCTYPE html>
 <html>
 <head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
 <body>meta redirect</body>
 </html>
--- a/tests/samples/meta-redirect-url.txt
+++ b/tests/samples/meta-redirect-url.txt
@@ -0,0 +1,8 @@
 HTTP/1.1 200 OK
 content-type: text/html; charset=UTF-8
 <!DOCTYPE html>
 <html>
 <head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
 <body>meta redirect</body>
 </html>
--- a/tests/samples/size-1MiB.txt
+++ b/tests/samples/size-1MiB.txt
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -0,0 +1,62 @@
 import pytest
 from morss.crawler import *
 def test_get(replay_server):
     " `get` must return the raw body of the 200-ok.txt sample "
     # the url is presumably served by the `replay_server` fixture
     body = get('http://localhost:8888/200-ok.txt')
     assert body == b'success\r\n'
 def test_adv_get(replay_server):
     " `adv_get` must expose the raw body under the 'data' key "
     response = adv_get('http://localhost:8888/200-ok.txt')
     assert response['data'] == b'success\r\n'
@pytest.mark.parametrize('before,after', [
    # bytes input is expected back as text
    (b'http://localhost:8888/', 'http://localhost:8888/'),
    # a missing scheme is expected to become http://
    ('localhost:8888/', 'http://localhost:8888/'),
    # a single slash after the scheme is expected to be repaired
    ('http:/localhost:8888/', 'http://localhost:8888/'),
    # ascii delimiters must be left untouched
    ('http://localhost:8888/&/', 'http://localhost:8888/&/'),
    # spaces are expected as %20
    ('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
    # non-ascii host -> idna form, non-ascii path -> utf-8 percent-escapes
    ('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
    ('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
])
def test_sanitize_url(before, after):
    " every raw url must come out in its expected sanitized form "
    cleaned = sanitize_url(before)
    assert cleaned == after
@pytest.mark.parametrize('opener', [
    custom_opener(),
    build_opener(SizeLimitHandler(500*1024)),
])
def test_size_limit_handler(replay_server, opener):
    " reading the size-1MiB.txt sample must yield exactly 500*1024 bytes "
    payload = opener.open('http://localhost:8888/size-1MiB.txt').read()
    assert len(payload) == 500*1024
@pytest.mark.parametrize('opener', [
    custom_opener(),
    build_opener(GZIPHandler()),
])
def test_gzip_handler(replay_server, opener):
    " the gzip.txt sample must be read back as the plain 'success' body "
    payload = opener.open('http://localhost:8888/gzip.txt').read()
    assert payload == b'success\n'
@pytest.mark.parametrize('opener', [
    custom_opener(),
    build_opener(EncodingFixHandler()),
])
@pytest.mark.parametrize('url', [
    'enc-gb2312-header.txt',
    'enc-gb2312-meta.txt',
    #'enc-gb2312-missing.txt',
    'enc-iso-8859-1-header.txt',
    'enc-iso-8859-1-missing.txt',
    'enc-utf-8-header.txt',
])
def test_encoding_fix_handler(replay_server, opener, url):
    " decoding a sample with its reported encoding must reveal a 'success' word "
    # NOTE(review): `opener` is parametrized but never used below, the request
    # always goes through `adv_get` -- confirm whether this is intended
    response = adv_get('http://localhost:8888/%s' % url)
    text = response['data'].decode(response['encoding'])
    assert 'succes' in text or 'succès' in text or '成功' in text
@pytest.mark.parametrize('opener', [
    custom_opener(follow='rss'),
    build_opener(AlternateHandler(MIMETYPE['rss'])),
])
def test_alternate_handler(replay_server, opener):
    " opening alternate-abs.txt must end up on the 200-ok.txt url "
    landed = opener.open('http://localhost:8888/alternate-abs.txt').geturl()
    assert landed == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [
    custom_opener(),
    build_opener(HTTPEquivHandler(), HTTPRefreshHandler()),
])
def test_http_equiv_handler(replay_server, opener):
    " every meta-redirect sample must end up on the 200-ok.txt url "
    target = 'http://localhost:8888/200-ok.txt'
    sources = (
        'http://localhost:8888/meta-redirect-abs.txt',
        'http://localhost:8888/meta-redirect-rel.txt',
        'http://localhost:8888/meta-redirect-url.txt',
    )
    # checked in order, the first failing sample stops the test
    for source in sources:
        assert opener.open(source).geturl() == target
@pytest.mark.parametrize('opener', [
    custom_opener(),
    build_opener(HTTPAllRedirectHandler()),
])
def test_http_all_redirect_handler(replay_server, opener):
    " every 301/308 redirect sample must end up on the 200-ok.txt url "
    target = 'http://localhost:8888/200-ok.txt'
    sources = (
        'http://localhost:8888/308-redirect.txt',
        'http://localhost:8888/301-redirect-abs.txt',
        'http://localhost:8888/301-redirect-rel.txt',
        'http://localhost:8888/301-redirect-url.txt',
    )
    # checked in order, the first failing sample stops the test
    for source in sources:
        assert opener.open(source).geturl() == target
@pytest.mark.parametrize('opener', [
    custom_opener(),
    build_opener(HTTPRefreshHandler()),
])
def test_http_refresh_handler(replay_server, opener):
    " opening header-refresh.txt must end up on the 200-ok.txt url "
    landed = opener.open('http://localhost:8888/header-refresh.txt').geturl()
    assert landed == 'http://localhost:8888/200-ok.txt'
Author	SHA1	Message	Date
pictuga	4d64afe9cb	crawler: fix regression from `d6b90448f3` Some checks failed continuous-integration/drone/push Build is failing Details	2022-02-01 23:18:16 +01:00
pictuga	d3b623482d	pytest: crawler	2022-02-01 23:16:43 +01:00
		`@@ -0,0 +1,3 @@`
							`HTTP/1.1 301 Moved Permanently`
							`location: /200-ok.txt`
		`@@ -0,0 +1,3 @@`
							`HTTP/1.1 301 Moved Permanently`
							`location: ./200-ok.txt`
		`@@ -0,0 +1,3 @@`
							`HTTP/1.1 301 Moved Permanently`
							`location: http://localhost:8888/200-ok.txt`
		`@@ -0,0 +1,3 @@`
							`HTTP/1.1 200 OK`
							`refresh: 0;url=/200-ok.txt`