From c27c38f7c7257728084b54e398389a0d67666cae Mon Sep 17 00:00:00 2001
From: pictuga
Date: Tue, 28 Apr 2020 22:29:07 +0200
Subject: [PATCH] crawler: return dict instead of tuple

---
 morss/crawler.py     | 14 ++++++++++----
 morss/feeds.py       |  4 ++--
 morss/morss.py       | 26 +++++++++++++-------------
 morss/readabilite.py |  4 ++--
 4 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index 4e68593..cc98838 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -55,7 +55,7 @@ PROTOCOL = ['http', 'https']
 
 
 def get(*args, **kwargs):
-    return adv_get(*args, **kwargs)[0]
+    return adv_get(*args, **kwargs)['data']
 
 
 def adv_get(url, timeout=None, *args, **kwargs):
@@ -72,7 +72,13 @@ def adv_get(url, timeout=None, *args, **kwargs):
     contenttype = con.info().get('Content-Type', '').split(';')[0]
     encoding= detect_encoding(data, con)
 
-    return data, con, contenttype, encoding
+    return {
+        'data':data,
+        'url': con.geturl(),
+        'con': con,
+        'contenttype': contenttype,
+        'encoding': encoding
+    }
 
 
 def custom_handler(follow=None, delay=None, encoding=None):
@@ -621,7 +627,7 @@ class MySQLCacheHandler(BaseCache):
 
 
 if __name__ == '__main__':
-    data, con, contenttype, encoding = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
 
     if not sys.flags.interactive:
-        print(data.decode(encoding))
+        print(req['data'].decode(req['encoding']))
diff --git a/morss/feeds.py b/morss/feeds.py
index 6b9df2f..1be9b27 100644
--- a/morss/feeds.py
+++ b/morss/feeds.py
@@ -759,8 +759,8 @@ class ItemJSON(Item, ParserJSON):
 if __name__ == '__main__':
     from . import crawler
 
-    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
-    feed = parse(data, url=con.geturl(), encoding=encoding)
+    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://www.nytimes.com/', follow='rss')
+    feed = parse(req['data'], url=req['url'], encoding=req['encoding'])
 
     if not sys.flags.interactive:
         for item in feed.items:
diff --git a/morss/morss.py b/morss/morss.py
index e6b25a1..83990ec 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -248,17 +248,17 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
+        req = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
 
     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode
 
-    if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
+    if req['contenttype'] not in crawler.MIMETYPE['html'] and req['contenttype'] != 'text/plain':
         log('non-text page')
         return True
 
-    out = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    out = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
 
     if out is not None:
         item.content = out
@@ -303,14 +303,14 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
+        req = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
 
     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')
 
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml, encoding=encoding)
+        rss = feeds.FeedHTML(req['data'], encoding=req['encoding'])
 
         rss.rules['title'] = options.title if options.title else '//head/title'
         rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content'
@@ -330,13 +330,13 @@ def FeedFetch(url, options):
 
     else:
         try:
-            rss = feeds.parse(xml, url, encoding=encoding)
+            rss = feeds.parse(req['data'], url=url, encoding=req['encoding'])
             rss = rss.convert(feeds.FeedXML)
                 # contains all fields, otherwise much-needed data can be lost
 
         except TypeError:
             log('random page')
-            log(contenttype)
+            log(req['contenttype'])
             raise MorssException('Link provided is not a valid feed')
 
     return rss
@@ -594,12 +594,12 @@ def cgi_get(environ, start_response):
     url, options = cgi_parse_environ(environ)
 
     # get page
-    data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=TIMEOUT)
+    req = crawler.adv_get(url=url, timeout=TIMEOUT)
 
-    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
+    if req['contenttype'] in ['text/html', 'application/xhtml+xml', 'application/xml']:
         if options.get == 'page':
-            html = readabilite.parse(data, encoding=encoding)
-            html.make_links_absolute(con.geturl())
+            html = readabilite.parse(req['data'], encoding=req['encoding'])
+            html.make_links_absolute(req['url'])
 
             kill_tags = ['script', 'iframe', 'noscript']
 
@@ -610,13 +610,13 @@ def cgi_get(environ, start_response):
             output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
 
         elif options.get == 'article':
-            output = readabilite.get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='utf-8', debug=options.debug)
+            output = readabilite.get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='utf-8', debug=options.debug)
 
         else:
             raise MorssException('no :get option passed')
 
     else:
-        output = data
+        output = req['data']
 
     # return html page
     headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'}
diff --git a/morss/readabilite.py b/morss/readabilite.py
index e14f88b..a4514b6 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -348,8 +348,8 @@ if __name__ == '__main__':
     import sys
     from . import crawler
 
-    data, con, contenttype, encoding = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
-    article = get_article(data, url=con.geturl(), encoding_in=encoding, encoding_out='unicode')
+    req = crawler.adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
+    article = get_article(req['data'], url=req['url'], encoding_in=req['encoding'], encoding_out='unicode')
 
     if not sys.flags.interactive:
         print(article)
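
For callers migrating to the new return value: adv_get() no longer returns a
(data, con, contenttype, encoding) tuple; it returns a dict, which also exposes
the post-redirect URL directly under a 'url' key instead of requiring
con.geturl(). A minimal sketch of the new contract, assuming the morss package
is importable as laid out in the files above (key names taken from the
morss/crawler.py hunk):

    from morss import crawler

    # adv_get() now returns a dict rather than a 4-tuple
    req = crawler.adv_get('https://morss.it')

    req['data']         # raw response body (bytes)
    req['url']          # final URL after redirects (con.geturl())
    req['con']          # underlying connection/response object
    req['contenttype']  # MIME type, e.g. 'text/html'
    req['encoding']     # detected character encoding

    # old call sites unpacked the tuple; new ones index by key:
    print(req['data'].decode(req['encoding']))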