Compare commits

..

3 Commits

Author SHA1 Message Date
pictuga cb69e3167f crawler: accept non-ascii urls
Covering one more corner case!
2020-04-28 14:47:23 +02:00
pictuga c3f06da947 morss: process(): specify encoding for clarity 2020-04-28 14:45:00 +02:00
pictuga 44a3e0edc4 readabilite: specify in- and out-going encoding 2020-04-28 14:44:35 +02:00
3 changed files with 42 additions and 8 deletions

View File

@ -12,10 +12,14 @@ import random
try:
# python 2
from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
from urllib import quote
from urlparse import urlparse, urlunparse
import mimetools
except ImportError:
# python 3
from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
from urllib.parse import quote
from urllib.parse import urlparse, urlunparse
import email
try:
@ -52,6 +56,8 @@ def get(*args, **kwargs):
def adv_get(url, timeout=None, *args, **kwargs):
url = encode_url(url)
if timeout is None:
con = custom_handler(*args, **kwargs).open(url)
@ -95,6 +101,34 @@ def custom_handler(follow=None, delay=None, encoding=None):
return build_opener(*handlers)
def is_ascii(string):
    """Tell whether `string` can be encoded with the 'ascii' codec.

    Returns True when the encoding succeeds, False when it raises a
    UnicodeError.
    """
    # str.isascii() only exists on recent py3 releases, so probe with an
    # actual encoding attempt to keep working on older interpreters
    try:
        string.encode('ascii')
        return True

    except UnicodeError:
        return False
def encode_url(url):
    """Escape the non-ascii unicode characters of a url.

    The url is split with urlparse() and only the components which contain
    non-ascii characters are rewritten: the network location goes through
    the 'idna' codec, any other component is utf-8 encoded and then
    percent-escaped. Components which are already plain ascii are kept as-is.

    Returns the url put back together with urlunparse().
    """
    # https://stackoverflow.com/a/4391299

    # Characters quote() must leave alone: the reserved url delimiters (so that
    # 'a=b&c=d', ';params' and the like keep their meaning) plus '%' (so that
    # escapes already present in the url are not escaped a second time).
    # With quote()'s default of safe='/', all of those would be escaped.
    safe = "!#$%&'()*+,/:;=?@[]~"

    parts = list(urlparse(url))

    for i, part in enumerate(parts):
        if is_ascii(part):
            continue

        if i == 1:
            # index 1 of the urlparse() result is the network location
            # NOTE(review): the whole netloc is handed to the codec, including
            # any 'user:pass@' prefix -- confirm such urls are not expected
            parts[i] = part.encode('idna').decode('ascii')

        else:
            parts[i] = quote(part.encode('utf-8'), safe=safe)

    return urlunparse(parts)
class DebugHandler(BaseHandler):
handler_order = 2000

View File

@ -260,7 +260,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
log('non-text page')
return True
out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
if out is not None:
item.content = out
@ -460,7 +460,7 @@ def process(url, cache=None, options=None):
rss = FeedFetch(url, options)
rss = FeedGather(rss, url, options)
return FeedFormat(rss, options)
return FeedFormat(rss, options, 'unicode')
def cgi_parse_environ(environ):
@ -635,7 +635,7 @@ def cgi_get(environ, start_response):
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
elif options.get == 'article':
output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug)
output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug)
else:
raise MorssException('no :get option passed')

View File

@ -316,10 +316,10 @@ def get_best_node(ranked_grades):
return lowest
def get_article(data, url=None, encoding=None, debug=False, threshold=5):
def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
" Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding)
html = parse(data, encoding_in)
score_all(html)
scores = rank_grades(get_all_scores(html))
@ -341,7 +341,7 @@ def get_article(data, url=None, encoding=None, debug=False, threshold=5):
if url:
best.make_links_absolute(url)
return lxml.etree.tostring(best if not debug else html, pretty_print=True)
return lxml.etree.tostring(best if not debug else html, pretty_print=True, encoding=encoding_out)
if __name__ == '__main__':
@ -349,7 +349,7 @@ if __name__ == '__main__':
from . import crawler
data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
article = get_article(data, url=con.geturl(), encoding=encoding)
article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
if not sys.flags.interactive:
print(article.decode(encoding))
print(article)