Compare commits

..

2 Commits

Author SHA1 Message Date
pictuga 4d64afe9cb crawler: fix regression from d6b90448f3
continuous-integration/drone/push Build is failing Details
2022-02-01 23:18:16 +01:00
pictuga d3b623482d pytest: crawler 2022-02-01 23:16:43 +01:00
19 changed files with 9391 additions and 6 deletions

View File

@ -32,7 +32,7 @@ from .caching import default_cache
try: try:
# python 2 # python 2
from urllib import quote, unquote from urllib import quote
from httplib import HTTPMessage from httplib import HTTPMessage
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler, from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
@ -43,7 +43,7 @@ except ImportError:
# python 3 # python 3
from email import message_from_string from email import message_from_string
from http.client import HTTPMessage from http.client import HTTPMessage
from urllib.parse import quote, unquote, urlsplit from urllib.parse import quote, urlsplit
from urllib.request import (BaseHandler, HTTPCookieProcessor, from urllib.request import (BaseHandler, HTTPCookieProcessor,
HTTPRedirectHandler, Request, addinfourl, HTTPRedirectHandler, Request, addinfourl,
build_opener, parse_http_list, parse_keqv_list) build_opener, parse_http_list, parse_keqv_list)
@ -151,6 +151,28 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
return build_opener(*handlers) return build_opener(*handlers)
def is_ascii(string):
    """Tell whether `string` only contains ascii characters.

    Accepts text as well as byte strings. `str.isascii` exists on recent
    python 3 releases, but this home-made check is kept for backward
    compatibility.

    Returns True when the value can be represented in ascii, False otherwise.
    """
    try:
        if isinstance(string, bytes):
            # python 3 `bytes` has no `.encode`; decoding is the equivalent
            # check (and is what python 2 implicitly does for byte strings)
            string.decode('ascii')

        else:
            string.encode('ascii')

    except UnicodeError:
        # covers both UnicodeEncodeError and UnicodeDecodeError
        return False

    return True
def soft_quote(string):
    """Url-quote `string` only when it is not a valid ascii string.

    Pure-ascii input is returned untouched. Otherwise the utf-8 encoded value
    is percent-encoded, leaving '%' and the RFC 3986 reserved characters
    as-is, so that escapes already present (e.g. '%20') are not escaped a
    second time and delimiters such as '&', '=' or '?' keep their meaning.
    """
    if is_ascii(string):
        return string

    # safe set: '%' (existing escapes) + gen-delims + sub-delims
    return quote(string.encode('utf-8'), safe="%:/?#[]@!$&'()*+,;=")
def sanitize_url(url): def sanitize_url(url):
# make sure the url is unicode, i.e. not bytes # make sure the url is unicode, i.e. not bytes
if isinstance(url, bytes): if isinstance(url, bytes):
@ -163,7 +185,10 @@ def sanitize_url(url):
# turns out some websites have really badly fomatted urls (fix http:/badurl) # turns out some websites have really badly fomatted urls (fix http:/badurl)
url = re.sub('^(https?):/([^/])', r'\1://\2', url) url = re.sub('^(https?):/([^/])', r'\1://\2', url)
# escape non-ascii unicode characters (also encode spaces as %20) # escape spaces
url = url.replace(' ', '%20')
# escape non-ascii unicode characters
parts = urlsplit(url) parts = urlsplit(url)
parts = parts._replace( parts = parts._replace(
@ -171,9 +196,9 @@ def sanitize_url(url):
parts.hostname, parts.hostname,
parts.hostname.encode('idna').decode('ascii') parts.hostname.encode('idna').decode('ascii')
), ),
path=quote(unquote(parts.path).encode('utf-8')), path=soft_quote(parts.path),
query=quote(unquote(parts.query).encode('utf-8')), query=soft_quote(parts.query),
fragment=quote(unquote(parts.fragment).encode('utf-8')), fragment=soft_quote(parts.fragment),
) )
return parts.geturl() return parts.geturl()

View File

@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain
success

View File

@ -0,0 +1,3 @@
HTTP/1.1 301 Moved Permanently
location: /200-ok.txt

View File

@ -0,0 +1,3 @@
HTTP/1.1 301 Moved Permanently
location: ./200-ok.txt

View File

@ -0,0 +1,3 @@
HTTP/1.1 301 Moved Permanently
location: http://localhost:8888/200-ok.txt

View File

@ -0,0 +1,4 @@
HTTP/1.1 308 Permanent Redirect
location: /200-ok.txt
/200-ok.txt

View File

@ -0,0 +1,8 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><link rel="alternate" type="application/rss+xml" href="/200-ok.txt" /></head>
<body>meta redirect</body>
</html>

View File

@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=gb2312
成功

View File

@ -0,0 +1,10 @@
HTTP/1.1 200 OK
content-type: text/html
<!DOCTYPE html>
<html>
<head><meta charset="gb2312"/></head>
<body>
成功
</body></html>

View File

@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=iso-8859-1
succès

View File

@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain
succès

View File

@ -0,0 +1,4 @@
HTTP/1.1 200 OK
content-type: text/plain; charset=UTF-8
succès

Binary file not shown.

View File

@ -0,0 +1,3 @@
HTTP/1.1 200 OK
refresh: 0;url=/200-ok.txt

View File

@ -0,0 +1,8 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = /200-ok.txt" /></head>
<body>meta redirect</body>
</html>

View File

@ -0,0 +1,8 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = ./200-ok.txt" /></head>
<body>meta redirect</body>
</html>

View File

@ -0,0 +1,8 @@
HTTP/1.1 200 OK
content-type: text/html; charset=UTF-8
<!DOCTYPE html>
<html>
<head><meta http-equiv="refresh" content="2; url = http://localhost:8888/200-ok.txt" /></head>
<body>meta redirect</body>
</html>

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,62 @@
import pytest
from morss.crawler import *
def test_get(replay_server):
    """`get` must hand back the raw body of the plain 200 response."""
    # NOTE(review): `replay_server` is a fixture defined outside this file;
    # presumably it serves the canned responses on localhost:8888 -- confirm
    body = get('http://localhost:8888/200-ok.txt')
    assert body == b'success\r\n'
def test_adv_get(replay_server):
    """`adv_get` must expose the raw response body under the 'data' key."""
    result = adv_get('http://localhost:8888/200-ok.txt')
    assert result['data'] == b'success\r\n'
@pytest.mark.parametrize('before,after', [
    # bytes input comes out as text
    (b'http://localhost:8888/', 'http://localhost:8888/'),
    # missing scheme
    ('localhost:8888/', 'http://localhost:8888/'),
    # single slash after the scheme
    ('http:/localhost:8888/', 'http://localhost:8888/'),
    # ascii delimiters are left alone
    ('http://localhost:8888/&/', 'http://localhost:8888/&/'),
    # spaces become %20
    ('http://localhost:8888/ /', 'http://localhost:8888/%20/'),
    # non-ascii host (idna) and path (percent-encoded utf-8), without/with port
    ('http://localhost-€/€/', 'http://xn--localhost--077e/%E2%82%AC/'),
    ('http://localhost-€:8888/€/', 'http://xn--localhost--077e:8888/%E2%82%AC/'),
])
def test_sanitize_url(before, after):
    """Each raw url must be normalised by `sanitize_url` to the expected one."""
    cleaned = sanitize_url(before)
    assert cleaned == after
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(SizeLimitHandler(500*1024))])
def test_size_limit_handler(replay_server, opener):
    """With either opener, reading 'size-1MiB.txt' must yield exactly 500 KiB."""
    payload = opener.open('http://localhost:8888/size-1MiB.txt').read()
    assert len(payload) == 500*1024
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(GZIPHandler())])
def test_gzip_handler(replay_server, opener):
    """With either opener, reading 'gzip.txt' must yield the plain text body."""
    # NOTE(review): presumably the served payload is gzip-compressed -- the
    # fixture itself is not visible here, confirm
    content = opener.open('http://localhost:8888/gzip.txt').read()
    assert content == b'success\n'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(EncodingFixHandler())])
@pytest.mark.parametrize('url', [
    'enc-gb2312-header.txt', 'enc-gb2312-meta.txt', #'enc-gb2312-missing.txt',
    'enc-iso-8859-1-header.txt', 'enc-iso-8859-1-missing.txt',
    'enc-utf-8-header.txt',
])
def test_encoding_fix_handler(replay_server, opener, url):
    """The body must decode, using the reported encoding, to a known word."""
    # NOTE(review): `opener` is parametrized but never used in this body, the
    # request always goes through `adv_get` -- confirm whether that is intended
    response = adv_get('http://localhost:8888/%s' % url)
    text = response['data'].decode(response['encoding'])
    assert any(word in text for word in ('succes', 'succès', '成功'))
@pytest.mark.parametrize('opener', [custom_opener(follow='rss'), build_opener(AlternateHandler(MIMETYPE['rss']))])
def test_alternate_handler(replay_server, opener):
    """Opening 'alternate-abs.txt' must end up at the '200-ok.txt' url."""
    final_url = opener.open('http://localhost:8888/alternate-abs.txt').geturl()
    assert final_url == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPEquivHandler(), HTTPRefreshHandler())])
def test_http_equiv_handler(replay_server, opener):
    """Every 'meta-redirect-*' page must end up at the '200-ok.txt' url."""
    sources = (
        'http://localhost:8888/meta-redirect-abs.txt',
        'http://localhost:8888/meta-redirect-rel.txt',
        'http://localhost:8888/meta-redirect-url.txt',
    )

    # checked in the same order as listed, stopping at the first failure
    for source in sources:
        assert opener.open(source).geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPAllRedirectHandler())])
def test_http_all_redirect_handler(replay_server, opener):
    """Every 308/301 redirect page must end up at the '200-ok.txt' url."""
    sources = (
        'http://localhost:8888/308-redirect.txt',
        'http://localhost:8888/301-redirect-abs.txt',
        'http://localhost:8888/301-redirect-rel.txt',
        'http://localhost:8888/301-redirect-url.txt',
    )

    # checked in the same order as listed, stopping at the first failure
    for source in sources:
        assert opener.open(source).geturl() == 'http://localhost:8888/200-ok.txt'
@pytest.mark.parametrize('opener', [custom_opener(), build_opener(HTTPRefreshHandler())])
def test_http_refresh_handler(replay_server, opener):
    """Opening 'header-refresh.txt' must end up at the '200-ok.txt' url."""
    final_url = opener.open('http://localhost:8888/header-refresh.txt').geturl()
    assert final_url == 'http://localhost:8888/200-ok.txt'