Centralize url clean up in crawler.py

parent c186188557
commit 749acc87fc

crawler.py
@@ -51,12 +51,15 @@ DEFAULT_UAS = [
 ]
 
 
+PROTOCOL = ['http', 'https']
+
+
 def get(*args, **kwargs):
     return adv_get(*args, **kwargs)[0]
 
 
 def adv_get(url, timeout=None, *args, **kwargs):
-    url = encode_url(url)
+    url = sanitize_url(url)
 
     if timeout is None:
         con = custom_handler(*args, **kwargs).open(url)
@@ -113,8 +116,16 @@ def is_ascii(string):
         return True
 
 
-def encode_url(url):
-    " Escape non-ascii unicode characters "
+def sanitize_url(url):
+    if isinstance(url, bytes):
+        url = url.decode()
+
+    if url.split(':', 1)[0] not in PROTOCOL:
+        url = 'http://' + url
+
+    url = url.replace(' ', '%20')
+
+    # Escape non-ascii unicode characters
     # https://stackoverflow.com/a/4391299
     parts = list(urlparse(url))
 
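For reference, a minimal self-contained sketch of the centralized clean-up introduced above. It follows only the lines visible in this hunk; the non-ascii escaping of the parsed parts is cut off by the diff context, so it is approximated with urllib.parse.quote(), and the function name here is hypothetical.

# Minimal sketch of the new sanitize_url() behaviour, based only on the lines
# shown in the hunk above; the escaping of `parts` is truncated there, so it
# is approximated with urllib.parse.quote() (path component only).
from urllib.parse import urlparse, urlunparse, quote

PROTOCOL = ['http', 'https']

def sanitize_url_sketch(url):
    if isinstance(url, bytes):
        url = url.decode()

    # default to http:// when no supported scheme is given
    if url.split(':', 1)[0] not in PROTOCOL:
        url = 'http://' + url

    url = url.replace(' ', '%20')

    # escape non-ascii unicode characters (https://stackoverflow.com/a/4391299)
    parts = list(urlparse(url))
    parts[2] = quote(parts[2], safe='/%')
    return urlunparse(parts)

print(sanitize_url_sketch('example.com/héllo world'))
# http://example.com/h%C3%A9llo%20world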
morss.py

@@ -44,8 +44,6 @@ TIMEOUT = 4 # http timeout (in sec)
 DEBUG = False
 PORT = 8080
 
-PROTOCOL = ['http', 'https']
-
 
 def filterOptions(options):
     return options
@@ -297,22 +295,6 @@ def ItemAfter(item, options):
     return item
 
 
-def UrlFix(url):
-    if url is None:
-        raise MorssException('No url provided')
-
-    if isinstance(url, bytes):
-        url = url.decode()
-
-    if urlparse(url).scheme not in PROTOCOL:
-        url = 'http://' + url
-        log(url)
-
-    url = url.replace(' ', '%20')
-
-    return url
-
-
 def FeedFetch(url, options):
     # fetch feed
     delay = DELAY
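The removed UrlFix() and the new crawler.sanitize_url() detect a missing scheme slightly differently: the old code parsed the URL and inspected urlparse(url).scheme, while the new code looks at the text before the first ':'. Both checks appear verbatim in this diff; a tiny, purely illustrative comparison on common inputs:

from urllib.parse import urlparse

PROTOCOL = ['http', 'https']

def needs_default_scheme_old(url):
    return urlparse(url).scheme not in PROTOCOL      # check from the removed UrlFix()

def needs_default_scheme_new(url):
    return url.split(':', 1)[0] not in PROTOCOL      # check from the new sanitize_url()

for url in ['www.example.com/feed', 'https://www.example.com/feed']:
    print(url, needs_default_scheme_old(url), needs_default_scheme_new(url))
# both agree here: True for the bare host, False once https:// is present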
@@ -456,7 +438,6 @@ def process(url, cache=None, options=None):
     if cache:
         crawler.default_cache = crawler.SQLiteCache(cache)
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
 
@@ -529,7 +510,6 @@ def cgi_app(environ, start_response):
     crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
 
     # get the work done
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
 
     if headers['content-type'] == 'text/xml':
@@ -614,11 +594,6 @@ def cgi_get(environ, start_response):
     url, options = cgi_parse_environ(environ)
 
     # get page
-    PROTOCOL = ['http', 'https']
-
-    if urlparse(url).scheme not in ['http', 'https']:
-        url = 'http://' + url
-
     data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
 
     if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
@@ -698,7 +673,6 @@ def cli_app():
 
     crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
 
-    url = UrlFix(url)
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
     out = FeedFormat(rss, options, 'unicode')
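Taken together with the crawler.py hunks, callers now hand a raw, possibly scheme-less URL straight to crawler.adv_get(), which sanitizes it internally and returns the 4-tuple unpacked in the cgi_get hunk above. A usage sketch; the import path and the example URL are assumptions, while the call signature and return shape are taken from the diff:

# Usage sketch only: assumes the morss package layout exposes the crawler module.
from morss import crawler

# no manual UrlFix()/scheme check needed any more; adv_get() cleans the URL itself
data, con, contenttype, encoding = crawler.adv_get(url='www.example.com/feed', timeout=4)
print(contenttype, len(data))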