Compare commits

b888e068c05c3e92a7aed1dc19a782ff1ba8c0e2..da81edc651b25b02475be233e71c446b583aff7f

No commits in common. "b888e068c05c3e92a7aed1dc19a782ff1ba8c0e2" and "da81edc651b25b02475be233e71c446b583aff7f" have entirely different histories.

10 changed files with 31 additions and 270 deletions

.drone.yml

@@ -10,7 +10,6 @@ steps:
     - pip3 install --no-cache-dir .[full] .[dev]
     - isort --check-only --diff .
     - pylint morss --rcfile=.pylintrc --disable=C,R,W --fail-under=8
-    - pytest --cov=morss tests
 ---
 kind: pipeline
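
This drops the pytest run from the Drone pipeline; the changes below are the other half of the same cleanup: pytest is removed from the dev extras in setup.py, and the test suite and its sample fixtures are deleted outright.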

morss/crawler.py

@@ -32,18 +32,18 @@ from .caching import default_cache
 try:
     # python 2
-    from urllib import quote, unquote
+    from urllib import quote
     from httplib import HTTPMessage
     from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
                          Request, addinfourl, build_opener, parse_http_list,
                          parse_keqv_list)
-    from urlparse import urlsplit
+    from urlparse import urlparse, urlunparse

 except ImportError:
     # python 3
     from email import message_from_string
     from http.client import HTTPMessage
-    from urllib.parse import quote, unquote, urlsplit
+    from urllib.parse import quote, urlparse, urlunparse
     from urllib.request import (BaseHandler, HTTPCookieProcessor,
                                 HTTPRedirectHandler, Request, addinfourl,
                                 build_opener, parse_http_list, parse_keqv_list)
@@ -151,10 +151,22 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
     return build_opener(*handlers)


+def is_ascii(string):
+    # there's a native function in py3, but home-made fix for backward compatibility
+    try:
+        string.encode('ascii')
+
+    except UnicodeError:
+        return False
+
+    else:
+        return True
+
+
 def sanitize_url(url):
     # make sure the url is unicode, i.e. not bytes
     if isinstance(url, bytes):
-        url = url.decode('utf-8')
+        url = url.decode()

     # make sure there's a protocol (http://)
     if url.split(':', 1)[0] not in PROTOCOL:
@@ -163,20 +175,22 @@ def sanitize_url(url):
     # turns out some websites have really badly fomatted urls (fix http:/badurl)
     url = re.sub('^(https?):/([^/])', r'\1://\2', url)

-    # escape non-ascii unicode characters (also encode spaces as %20)
-    parts = urlsplit(url)
-
-    parts = parts._replace(
-        netloc=parts.netloc.replace(
-            parts.hostname,
-            parts.hostname.encode('idna').decode('ascii')
-        ),
-        path=quote(unquote(parts.path).encode('utf-8')),
-        query=quote(unquote(parts.query).encode('utf-8')),
-        fragment=quote(unquote(parts.fragment).encode('utf-8')),
-    )
-
-    return parts.geturl()
+    # escape spaces
+    url = url.replace(' ', '%20')
+
+    # escape non-ascii unicode characters
+    # https://stackoverflow.com/a/4391299
+    parts = list(urlparse(url))
+
+    for i in range(len(parts)):
+        if not is_ascii(parts[i]):
+            if i == 1:
+                parts[i] = parts[i].encode('idna').decode('ascii')
+
+            else:
+                parts[i] = quote(parts[i].encode('utf-8'))
+
+    return urlunparse(parts)


 class RespDataHandler(BaseHandler):
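
Note on the rewritten sanitize_url(): spaces are replaced up front because a space is itself ASCII, so the is_ascii() loop alone would never catch it; the loop then IDNA-encodes the netloc (index 1 of the 6-tuple returned by urlparse()) and percent-encodes every other non-ASCII component. The unquote round-trip of the old version is gone, which also means an already percent-encoded URL is pure ASCII and now passes through untouched rather than being re-normalized. A minimal sketch of the new behaviour (illustrative only; bücher.de is a stand-in hostname, not taken from the diff):

    from urllib.parse import quote, urlparse, urlunparse

    parts = list(urlparse('http://bücher.de/café'))     # [scheme, netloc, path, params, query, fragment]
    parts[1] = parts[1].encode('idna').decode('ascii')  # netloc -> punycode
    parts[2] = quote(parts[2].encode('utf-8'))          # path -> percent-escapes
    print(urlunparse(parts))                            # http://xn--bcher-kva.de/caf%C3%A9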

setup.py

@@ -23,7 +23,7 @@ setup(
     install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
     extras_require = {
         'full': ['pymysql', 'redis', 'diskcache', 'gunicorn', 'setproctitle'],
-        'dev': ['pylint', 'pytest'],
+        'dev': ['pylint']
     },
     python_requires = '>=2.7',
     package_data = {package_name: ['feedify.ini']},
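
pytest leaves the dev extra here, mirroring the removed CI step above: nothing installs or runs the (now deleted) test suite anymore.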

tests/conftest.py

@@ -1,60 +0,0 @@
-import os
-import os.path
-import threading
-
-import pytest
-
-try:
-    # python2
-    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
-    from SimpleHTTPServer import SimpleHTTPRequestHandler
-
-except:
-    # python3
-    from http.server import (BaseHTTPRequestHandler, HTTPServer,
-                             SimpleHTTPRequestHandler)
-
-
-class HTTPReplayHandler(SimpleHTTPRequestHandler):
-    " Serves pages saved alongside with headers. See `curl --http1.1 -is http://...` "
-    directory = os.path.join(os.path.dirname(__file__), './samples/')
-
-    __init__ = BaseHTTPRequestHandler.__init__
-
-    def do_GET(self):
-        path = self.translate_path(self.path)
-
-        if os.path.isdir(path):
-            f = self.list_directory(path)
-
-        else:
-            f = open(path, 'rb')
-
-        try:
-            self.copyfile(f, self.wfile)
-
-        finally:
-            f.close()
-
-class MuteHTTPServer(HTTPServer):
-    def handle_error(self, request, client_address):
-        # mute errors
-        pass
-
-def make_server(port=8888):
-    print('Serving http://localhost:%s/' % port)
-    return MuteHTTPServer(('', port), RequestHandlerClass=HTTPReplayHandler)
-
-@pytest.fixture
-def replay_server():
-    httpd = make_server()
-    thread = threading.Thread(target=httpd.serve_forever)
-    thread.start()
-
-    yield
-
-    httpd.shutdown()
-    thread.join()
-
-if __name__ == '__main__':
-    httpd = make_server()
-    httpd.serve_forever()
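
This deleted conftest.py supplied the replay_server fixture that every test below depends on: for the duration of a test it serves the files under tests/samples/ (raw HTTP responses captured in `curl --http1.1 -is` style, status line and headers included) on http://localhost:8888/, and the `__main__` block let the replay server run standalone. pytest injects the fixture by parameter name, so a test only had to declare it; a minimal sketch, assuming the fixture above (test name hypothetical):

    def test_smoke(replay_server):
        # while the fixture is active, each sample file is reachable at
        # http://localhost:8888/<sample-file-name>
        ...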

tests/samples/feed-atom-utf-8.txt

@@ -1,16 +0,0 @@
-HTTP/1.1 200 OK
-Content-Type: text/xml; charset=utf-8
-
-<?xml version='1.0' encoding='utf-8'?>
-<feed xmlns="http://www.w3.org/2005/Atom">
-    <title>!TITLE!</title>
-    <subtitle>!DESC!</subtitle>
-    <entry>
-        <title>!ITEM_TITLE!</title>
-        <summary>!ITEM_DESC!</summary>
-        <content type="html">!ITEM_CONTENT!</content>
-        <link href="!ITEM_LINK!"/>
-        <updated>2022-01-01T00:00:01+01:00</updated>
-        <published>2022-01-01T00:00:02+01:00</published>
-    </entry>
-</feed>

tests/samples/feed-atom03-utf-8.txt

@@ -1,15 +0,0 @@
-HTTP/1.1 200 OK
-content-type: application/xml
-
-<?xml version='1.0' encoding='utf-8' ?>
-<feed version='0.3' xmlns='http://purl.org/atom/ns#'>
-    <title>!TITLE!</title>
-    <subtitle>!DESC!</subtitle>
-    <entry>
-        <title>!ITEM_TITLE!</title>
-        <link rel='alternate' type='text/html' href='!ITEM_LINK!' />
-        <summary>!ITEM_DESC!</summary>
-        <content>!ITEM_CONTENT!</content>
-        <issued>2022-01-01T00:00:01+01:00</issued> <!-- FIXME -->
-    </entry>
-</feed>

tests/samples/feed-html-utf-8.txt

@@ -1,22 +0,0 @@
-HTTP/1.1 200 OK
-Content-Type: text/html; charset=utf-8
-
-<html>
-    <head></head>
-
-    <body>
-        <div id="header">
-            <h1>!TITLE!</h1>
-            <p>!DESC!</p>
-        </div>
-
-        <div id="content">
-            <div class="item">
-                <a target="_blank" href="!ITEM_LINK!">!ITEM_TITLE!</a>
-                <div class="desc">!ITEM_DESC!</div>
-                <div class="content">!ITEM_CONTENT!</div>
-            </div>
-        </div>
-
-    </body>
-</html>

tests/samples/feed-json-utf-8.txt

@@ -1,16 +0,0 @@
-HTTP/1.1 200 OK
-Content-Type: application/json; charset=utf-8
-
-{
-    "title": "!TITLE!",
-    "desc": "!DESC!",
-    "items": [
-        {
-            "title": "!ITEM_TITLE!",
-            "time": "2022-01-01T00:00:01+0100",
-            "url": "!ITEM_LINK!",
-            "desc": "!ITEM_DESC!",
-            "content": "!ITEM_CONTENT!"
-        }
-    ]
-}

tests/samples/feed-rss-channel-utf-8.txt

@@ -1,17 +0,0 @@
-HTTP/1.1 200 OK
-Content-Type: text/xml; charset=utf-8
-
-<?xml version='1.0' encoding='utf-8'?>
-<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
-    <channel>
-        <title>!TITLE!</title>
-        <description>!DESC!</description>
-        <item>
-            <title>!ITEM_TITLE!</title>
-            <pubDate>Mon, 01 Jan 2022 00:00:01 +0100</pubDate>
-            <link>!ITEM_LINK!</link>
-            <description>!ITEM_DESC!</description>
-            <content:encoded>!ITEM_CONTENT!</content:encoded>
-        </item>
-    </channel>
-</rss>

tests/test_feeds.py

@@ -1,106 +0,0 @@
-import pytest
-
-from morss.crawler import adv_get
-from morss.feeds import *
-
-
-def get_feed(url):
-    url = 'http://localhost:8888/%s' % url
-    out = adv_get(url)
-    feed = parse(out['data'], url=url, encoding=out['encoding'])
-    return feed
-
-
-def check_feed(feed):
-    # NB. time and updated not covered
-    assert feed.title == '!TITLE!'
-    assert feed.desc == '!DESC!'
-    assert feed[0] == feed.items[0]
-    assert feed[0].title == '!ITEM_TITLE!'
-    assert feed[0].link == '!ITEM_LINK!'
-    assert '!ITEM_DESC!' in feed[0].desc # broader test due to possible inclusion of surrounding <div> in xml
-    assert '!ITEM_CONTENT!' in feed[0].content
-
-
-def check_output(feed):
-    output = feed.tostring()
-    assert '!TITLE!' in output
-    assert '!DESC!' in output
-    assert '!ITEM_TITLE!' in output
-    assert '!ITEM_LINK!' in output
-    assert '!ITEM_DESC!' in output
-    assert '!ITEM_CONTENT!' in output
-
-
-def check_change(feed):
-    feed.title = '!TITLE2!'
-    feed.desc = '!DESC2!'
-    feed[0].title = '!ITEM_TITLE2!'
-    feed[0].link = '!ITEM_LINK2!'
-    feed[0].desc = '!ITEM_DESC2!'
-    feed[0].content = '!ITEM_CONTENT2!'
-
-    assert feed.title == '!TITLE2!'
-    assert feed.desc == '!DESC2!'
-    assert feed[0].title == '!ITEM_TITLE2!'
-    assert feed[0].link == '!ITEM_LINK2!'
-    assert '!ITEM_DESC2!' in feed[0].desc
-    assert '!ITEM_CONTENT2!' in feed[0].content
-
-
-def check_add(feed):
-    feed.append({
-        'title': '!ITEM_TITLE3!',
-        'link': '!ITEM_LINK3!',
-        'desc': '!ITEM_DESC3!',
-        'content': '!ITEM_CONTENT3!',
-    })
-
-    assert feed[1].title == '!ITEM_TITLE3!'
-    assert feed[1].link == '!ITEM_LINK3!'
-    assert '!ITEM_DESC3!' in feed[1].desc
-    assert '!ITEM_CONTENT3!' in feed[1].content
-
-
-each_format = pytest.mark.parametrize('url', [
-    'feed-rss-channel-utf-8.txt', 'feed-atom-utf-8.txt',
-    'feed-atom03-utf-8.txt', 'feed-json-utf-8.txt', 'feed-html-utf-8.txt',
-])
-
-each_check = pytest.mark.parametrize('check', [
-    check_feed, check_output, check_change, check_add,
-])
-
-
-@each_format
-@each_check
-def test_parse(replay_server, url, check):
-    feed = get_feed(url)
-    check(feed)
-
-
-@each_format
-@each_check
-def test_convert_rss(replay_server, url, check):
-    feed = get_feed(url)
-    feed = feed.convert(FeedXML)
-    check(feed)
-
-
-@each_format
-@each_check
-def test_convert_json(replay_server, url, check):
-    feed = get_feed(url)
-    feed = feed.convert(FeedJSON)
-    check(feed)
-
-
-@each_format
-@each_check
-def test_convert_html(replay_server, url, check):
-    feed = get_feed(url)
-    feed = feed.convert(FeedHTML)
-
-    if len(feed) > 1:
-        # remove the 'blank' default html item
-        del feed[0]
-
-    check(feed)
-
-
-@each_format
-def test_convert_csv(replay_server, url):
-    # only csv output, not csv feed, check therefore differnet
-    feed = get_feed(url)
-    output = feed.tocsv()
-    assert '!ITEM_TITLE!' in output
-    assert '!ITEM_LINK!' in output
-    assert '!ITEM_DESC!' in output
-    assert '!ITEM_CONTENT!' in output
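
Since each_format supplies 5 sample feeds and each_check 4 check functions, each doubly-decorated test expands to 5 × 4 = 20 parametrized cases, and test_convert_csv (format mark only) to 5, so deleting this file removes 85 test cases in all. Before this change they were run by the `pytest --cov=morss tests` step removed from the CI pipeline above.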