Compare commits
No commits in common. "4d64afe9cb496d40d07f38dce63e1d0605fcff14" and "32645548c23f17e0236fa2eaf832df21c41df111" have entirely different histories.
4d64afe9cb
...
32645548c2
|
@ -32,7 +32,7 @@ from .caching import default_cache
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# python 2
|
# python 2
|
||||||
from urllib import quote
|
from urllib import quote, unquote
|
||||||
|
|
||||||
from httplib import HTTPMessage
|
from httplib import HTTPMessage
|
||||||
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
|
||||||
|
@ -43,7 +43,7 @@ except ImportError:
|
||||||
# python 3
|
# python 3
|
||||||
from email import message_from_string
|
from email import message_from_string
|
||||||
from http.client import HTTPMessage
|
from http.client import HTTPMessage
|
||||||
from urllib.parse import quote, urlsplit
|
from urllib.parse import quote, unquote, urlsplit
|
||||||
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
from urllib.request import (BaseHandler, HTTPCookieProcessor,
|
||||||
HTTPRedirectHandler, Request, addinfourl,
|
HTTPRedirectHandler, Request, addinfourl,
|
||||||
build_opener, parse_http_list, parse_keqv_list)
|
build_opener, parse_http_list, parse_keqv_list)
|
||||||
|
@ -151,28 +151,6 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
|
||||||
return build_opener(*handlers)
|
return build_opener(*handlers)
|
||||||
|
|
||||||
|
|
||||||
def is_ascii(string):
|
|
||||||
# there's a native function in py3, but home-made fix for backward compatibility
|
|
||||||
try:
|
|
||||||
string.encode('ascii')
|
|
||||||
|
|
||||||
except UnicodeError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def soft_quote(string):
|
|
||||||
" url-quote only when not a valid ascii string "
|
|
||||||
|
|
||||||
if is_ascii(string):
|
|
||||||
return string
|
|
||||||
|
|
||||||
else:
|
|
||||||
return quote(string.encode('utf-8'))
|
|
||||||
|
|
||||||
|
|
||||||
def sanitize_url(url):
|
def sanitize_url(url):
|
||||||
# make sure the url is unicode, i.e. not bytes
|
# make sure the url is unicode, i.e. not bytes
|
||||||
if isinstance(url, bytes):
|
if isinstance(url, bytes):
|
||||||
|
@ -185,10 +163,7 @@ def sanitize_url(url):
|
||||||
# turns out some websites have really badly formatted urls (fix http:/badurl)
|
# turns out some websites have really badly formatted urls (fix http:/badurl)
|
||||||
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
|
url = re.sub('^(https?):/([^/])', r'\1://\2', url)
|
||||||
|
|
||||||
# escape spaces
|
# escape non-ascii unicode characters (also encode spaces as %20)
|
||||||
url = url.replace(' ', '%20')
|
|
||||||
|
|
||||||
# escape non-ascii unicode characters
|
|
||||||
parts = urlsplit(url)
|
parts = urlsplit(url)
|
||||||
|
|
||||||
parts = parts._replace(
|
parts = parts._replace(
|
||||||
|
@ -196,9 +171,9 @@ def sanitize_url(url):
|
||||||
parts.hostname,
|
parts.hostname,
|
||||||
parts.hostname.encode('idna').decode('ascii')
|
parts.hostname.encode('idna').decode('ascii')
|
||||||
),
|
),
|
||||||
path=soft_quote(parts.path),
|
path=quote(unquote(parts.path).encode('utf-8')),
|
||||||
query=soft_quote(parts.query),
|
query=quote(unquote(parts.query).encode('utf-8')),
|
||||||
fragment=soft_quote(parts.fragment),
|
fragment=quote(unquote(parts.fragment).encode('utf-8')),
|
||||||
)
|
)
|
||||||
|
|
||||||
return parts.geturl()
|
return parts.geturl()
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain
|
|
||||||
|
|
||||||
success
|
|
|
@ -1,3 +0,0 @@
|
||||||
HTTP/1.1 301 Moved Permanently
|
|
||||||
location: /200-ok.txt
|
|
||||||
|
|
|
@ -1,3 +0,0 @@
|
||||||
HTTP/1.1 301 Moved Permanently
|
|
||||||
location: ./200-ok.txt
|
|
||||||
|
|
|
@ -1,3 +0,0 @@
|
||||||
HTTP/1.1 301 Moved Permanently
|
|
||||||
location: http://localhost:8888/200-ok.txt
|
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
HTTP/1.1 308 Permanent Redirect
|
|
||||||
location: /200-ok.txt
|
|
||||||
|
|
||||||
/200-ok.txt
|
|
|
@ -1,8 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html; charset=UTF-8
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
|
|
||||||
<body>meta redirect</body>
|
|
||||||
</html>
|
|
|
@ -1,4 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain; charset=gb2312
|
|
||||||
|
|
||||||
成功
|
|
|
@ -1,10 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html
|
|
||||||
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><meta charset="gb2312"/></head>
|
|
||||||
<body>
|
|
||||||
成功
|
|
||||||
</body></html>
|
|
|
@ -1,4 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain; charset=iso-8859-1
|
|
||||||
|
|
||||||
succès
|
|
|
@ -1,4 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain
|
|
||||||
|
|
||||||
succès
|
|
|
@ -1,4 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/plain; charset=UTF-8
|
|
||||||
|
|
||||||
succès
|
|
Binary file not shown.
|
@ -1,3 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
refresh: 0;url=/200-ok.txt
|
|
||||||
|
|
|
@ -1,8 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html; charset=UTF-8
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
|
|
||||||
<body>meta redirect</body>
|
|
||||||
</html>
|
|
|
@ -1,8 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html; charset=UTF-8
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
|
|
||||||
<body>meta redirect</body>
|
|
||||||
</html>
|
|
|
@ -1,8 +0,0 @@
|
||||||
HTTP/1.1 200 OK
|
|
||||||
content-type: text/html; charset=UTF-8
|
|
||||||
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
|
|
||||||
<body>meta redirect</body>
|
|
||||||
</html>
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,62 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from morss.crawler import *
|
|
||||||
|
|
||||||
|
|
||||||
def test_get(replay_server):
|
|
||||||
assert get('http://localhost:8888/200-ok.txt') == b'success\r\n'
|
|
||||||
|
|
||||||
def test_adv_get(replay_server):
|
|
||||||
assert adv_get('http://localhost:8888/200-ok.txt')['data'] == b'success\r\n'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('before,after', [
|
|
||||||
(b'http://localhost:8888/', 'http://localhost:8888/'),
|
|
||||||
('localhost:8888/', 'http://localhost:8888/'),
|
|
||||||
('http:/localhost:8888/', 'http://localhost:8888/'),
|
|
||||||
('http://localhost:8888/&/', 'http://localhost:8888/&/'),
|
|
||||||
('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
|
|
||||||
('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
|
|
||||||
('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
|
|
||||||
])
|
|
||||||
def test_sanitize_url(before, after):
|
|
||||||
assert sanitize_url(before) == after
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
|
|
||||||
def test_size_limit_handler(replay_server, opener):
|
|
||||||
assert len(opener.open('http://localhost:8888/size-1MiB.txt').read()) == 500*1024
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
|
|
||||||
def test_gzip_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/gzip.txt').read() == b'success\n'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
|
|
||||||
@pytest.mark.parametrize('url', [
|
|
||||||
'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
|
|
||||||
'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
|
|
||||||
'enc-utf-8-header.txt',
|
|
||||||
])
|
|
||||||
def test_encoding_fix_handler(replay_server, opener, url):
|
|
||||||
out = adv_get('http://localhost:8888/%s' % url)
|
|
||||||
out = out['data'].decode(out['encoding'])
|
|
||||||
assert 'succes' in out or 'succès' in out or '成功' in out
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
|
|
||||||
def test_alternate_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/alternate-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
|
|
||||||
def test_http_equiv_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/meta-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/meta-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/meta-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
|
|
||||||
def test_http_all_redirect_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/308-redirect.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/301-redirect-abs.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/301-redirect-rel.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
assert opener.open('http://localhost:8888/301-redirect-url.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
|
|
||||||
def test_http_refresh_handler(replay_server, opener):
|
|
||||||
assert opener.open('http://localhost:8888/header-refresh.txt').geturl() == 'http://localhost:8888/200-ok.txt'
|
|
Loading…
Reference in New Issue