Compare commits
No commits in common. "cb69e3167f7d5e9a059e4fc3a7b71b4f0025f046" and "4a9b505499790b999007f712317902ad58071924" have entirely different histories.
cb69e3167f...4a9b505499
@@ -12,14 +12,10 @@ import random
 try:
     # python 2
     from urllib2 import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
-    from urllib import quote
-    from urlparse import urlparse, urlunparse
     import mimetools
 except ImportError:
     # python 3
     from urllib.request import BaseHandler, HTTPCookieProcessor, Request, addinfourl, parse_keqv_list, parse_http_list, build_opener
-    from urllib.parse import quote
-    from urllib.parse import urlparse, urlunparse
     import email
 
 try:

@@ -56,8 +52,6 @@ def get(*args, **kwargs):
 
 
 def adv_get(url, timeout=None, *args, **kwargs):
-    url = encode_url(url)
-
     if timeout is None:
         con = custom_handler(*args, **kwargs).open(url)
 

@@ -101,34 +95,6 @@ def custom_handler(follow=None, delay=None, encoding=None):
     return build_opener(*handlers)
 
 
-def is_ascii(string):
-    # there's a native function in py3, but home-made fix for backward compatibility
-    try:
-        string.encode('ascii')
-
-    except UnicodeError:
-        return False
-
-    else:
-        return True
-
-
-def encode_url(url):
-    " Escape non-ascii unicode characters "
-    # https://stackoverflow.com/a/4391299
-    parts = list(urlparse(url))
-
-    for i in range(len(parts)):
-        if not is_ascii(parts[i]):
-            if i == 1:
-                parts[i] = parts[i].encode('idna').decode('ascii')
-
-            else:
-                parts[i] = quote(parts[i].encode('utf-8'))
-
-    return urlunparse(parts)
-
-
 class DebugHandler(BaseHandler):
     handler_order = 2000
 
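Note on the block removed above: encode_url() IDNA-encoded the hostname and percent-quoted any other non-ASCII URL component before adv_get() opened the URL, so after this change non-ASCII URLs reach the opener unescaped. For reference, a minimal Python 3 sketch of the same escaping; the helper name escape_non_ascii_url and the example URL are illustrative, not part of either commit:

    from urllib.parse import quote, urlparse, urlunparse

    def escape_non_ascii_url(url):
        # mirror the deleted encode_url(): IDNA-encode the host (index 1 of the
        # parsed 6-tuple), percent-quote every other non-ASCII component
        parts = list(urlparse(url))

        for i, part in enumerate(parts):
            try:
                part.encode('ascii')

            except UnicodeError:
                parts[i] = part.encode('idna').decode('ascii') if i == 1 else quote(part.encode('utf-8'))

        return urlunparse(parts)

    # escape_non_ascii_url('https://döner.example/päth')
    # -> 'https://xn--dner-5qa.example/p%C3%A4th'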
@@ -260,7 +260,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         log('non-text page')
         return True
 
-    out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
 
     if out is not None:
         item.content = out

@@ -460,7 +460,7 @@ def process(url, cache=None, options=None):
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
 
-    return FeedFormat(rss, options, 'unicode')
+    return FeedFormat(rss, options)
 
 
 def cgi_parse_environ(environ):

@@ -635,7 +635,7 @@ def cgi_get(environ, start_response):
         output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
 
     elif options.get == 'article':
-        output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug)
+        output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug)
 
     else:
         raise MorssException('no :get option passed')
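Both call sites above (ItemFill() and cgi_get()), like FeedFormat() in process(), stop asking for a specific output encoding and pass a single encoding= keyword instead, so whatever readabilite.get_article() returns by default is now handed straight through. A hedged sketch of how external code could obtain a str under either revision; article_as_text is a hypothetical helper, not part of the project, and it assumes the package is importable as morss:

    from morss import readabilite  # assumption: morss installed as a package

    def article_as_text(data, con, encoding):
        try:
            # older signature: request a unicode string directly
            return readabilite.get_article(data, url=con.geturl(),
                                           encoding_in=encoding, encoding_out='unicode')

        except TypeError:
            # newer signature: single encoding= keyword, output left to lxml's default (bytes)
            out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
            return out.decode(encoding) if isinstance(out, bytes) else out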
@@ -316,10 +316,10 @@ def get_best_node(ranked_grades):
     return lowest
 
 
-def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=False, threshold=5):
+def get_article(data, url=None, encoding=None, debug=False, threshold=5):
     " Input a raw html string, returns a raw html string of the article "
 
-    html = parse(data, encoding_in)
+    html = parse(data, encoding)
     score_all(html)
     scores = rank_grades(get_all_scores(html))
 

@@ -341,7 +341,7 @@ def get_article(data, url=None, encoding_in=None, encoding_out='unicode', debug=
     if url:
         best.make_links_absolute(url)
 
-    return lxml.etree.tostring(best if not debug else html, pretty_print=True, encoding=encoding_out)
+    return lxml.etree.tostring(best if not debug else html, pretty_print=True)
 
 
 if __name__ == '__main__':

@@ -349,7 +349,7 @@ if __name__ == '__main__':
     from . import crawler
 
     data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
-    article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    article = get_article(data, url=con.geturl(), encoding=encoding)
 
     if not sys.flags.interactive:
-        print(article)
+        print(article.decode(encoding))
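The last two hunks explain each other: once encoding_out is gone, lxml.etree.tostring() is called without an encoding argument and therefore returns bytes (ASCII with character references) rather than a str, which is why the __main__ block now decodes before printing. A standalone illustration of that default; the sample markup is made up and only lxml is assumed:

    import lxml.etree

    root = lxml.etree.fromstring('<article><p>héllo</p></article>')

    as_bytes = lxml.etree.tostring(root, pretty_print=True)                     # default: bytes, 'é' -> '&#233;'
    as_text = lxml.etree.tostring(root, pretty_print=True, encoding='unicode')  # str, as the old encoding_out='unicode' did

    print(type(as_bytes), type(as_text))  # <class 'bytes'> <class 'str'>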