From 7691df52579eab797c2e02e667cb9977958e4e48 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Tue, 7 Apr 2020 10:30:17 +0200
Subject: [PATCH] Use wrapper for http calls

---
 morss/crawler.py | 19 +++++++++++++++++++
 morss/morss.py   | 15 +++------------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index 8b1254f..b9388a0 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -34,6 +34,25 @@ MIMETYPE = {
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 
 
+def get(*args, **kwargs):
+    return adv_get(*args, **kwargs)[0]
+
+
+def adv_get(url, timeout=None, *args, **kwargs):
+    if timeout is None:
+        con = custom_handler(*args, **kwargs).open(url)
+
+    else:
+        con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
+
+    data = con.read()
+
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    encoding = detect_encoding(data, con)
+
+    return data, con, contenttype, encoding
+
+
 def custom_handler(follow=None, delay=None, encoding=None):
     handlers = []
 
diff --git a/morss/morss.py b/morss/morss.py
index b4234a7..19973a0 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -251,14 +251,12 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
-        data = con.read()
+        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
 
     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode
 
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
     if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
         log('non-text page')
         return True
@@ -324,15 +322,11 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
-            .open(url, timeout=TIMEOUT * 2)
-        xml = con.read()
+        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
 
     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')
 
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
-
     if options.items:
         # using custom rules
         rss = feeds.FeedHTML(xml)
@@ -652,10 +646,7 @@ def cgi_page(environ, start_response):
     if urlparse(url).scheme not in ['http', 'https']:
         url = 'http://' + url
 
-    con = crawler.custom_handler().open(url)
-    data = con.read()
-
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    data, con, contenttype, encoding = crawler.adv_get(url=url)
 
     if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
         html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())