Use wrapper for HTTP calls

2020-04-07 10:30:17 +02:00
parent 0ae0dbc175
commit 7691df5257
2 changed files with 22 additions and 12 deletions
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -34,6 +34,25 @@ MIMETYPE = {
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'


+def get(*args, **kwargs):  # convenience wrapper: takes the same arguments as adv_get
+    return adv_get(*args, **kwargs)[0]  # item 0 of adv_get's tuple is the data read from the connection
+
+
+def adv_get(url, timeout=None, *args, **kwargs):  # open url; extra args/kwargs are forwarded to custom_handler
+    if timeout is None:  # no timeout requested: call open() without a timeout argument
+        con = custom_handler(*args, **kwargs).open(url)

+    else:  # explicit timeout: pass it through to open()
+        con = custom_handler(*args, **kwargs).open(url, timeout=timeout)

+    data = con.read()  # read the response body from the connection

+    contenttype = con.info().get('Content-Type', '').split(';')[0]  # header value before the first ';' ('' if header is absent)
+    encoding= detect_encoding(data, con)  # whatever detect_encoding derives from the body and the connection

+    return data, con, contenttype, encoding  # con is returned still open; callers get all four values as a tuple
+
+
 def custom_handler(follow=None, delay=None, encoding=None):
    handlers = []

--- a/morss/morss.py
+++ b/morss/morss.py
@@ -251,14 +251,12 @@ def ItemFill(item, options, feedurl='/', fast=False):
        delay = -2

    try:
-        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
-        data = con.read()
+        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)

    except (IOError, HTTPException) as e:
        log('http error')
        return False # let's just delete errors stuff when in cache mode

-    contenttype = con.info().get('Content-Type', '').split(';')[0]
    if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
        log('non-text page')
        return True
@@ -324,15 +322,11 @@ def FeedFetch(url, options):
        delay = 0

    try:
-        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
-            .open(url, timeout=TIMEOUT * 2)
-        xml = con.read()
+        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)

    except (IOError, HTTPException):
        raise MorssException('Error downloading feed')

-    contenttype = con.info().get('Content-Type', '').split(';')[0]
-
    if options.items:
        # using custom rules
        rss = feeds.FeedHTML(xml)
@@ -652,10 +646,7 @@ def cgi_page(environ, start_response):
    if urlparse(url).scheme not in ['http', 'https']:
        url = 'http://' + url

-    con = crawler.custom_handler().open(url)
-    data = con.read()
-
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    data, con, contenttype, encoding = crawler.adv_get(url=url)

    if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())