Use wrapper for HTTP calls
@@ -34,6 +34,25 @@ MIMETYPE = {
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 
 
+def get(*args, **kwargs):
+    return adv_get(*args, **kwargs)[0]
+
+
+def adv_get(url, timeout=None, *args, **kwargs):
+    if timeout is None:
+        con = custom_handler(*args, **kwargs).open(url)
+
+    else:
+        con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
+
+    data = con.read()
+
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    encoding = detect_encoding(data, con)
+
+    return data, con, contenttype, encoding
+
+
 def custom_handler(follow=None, delay=None, encoding=None):
     handlers = []
 
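A minimal sketch of how a caller might use the new wrapper, assuming the module is importable as `crawler` (the import path, URL, timeout value and decode step below are illustrative, not part of this commit):

    import crawler  # hypothetical import; the morss call sites below already have it in scope

    # adv_get() opens the URL through custom_handler(), reads the body and returns
    # (data, con, contenttype, encoding); get() keeps only the body.
    data, con, contenttype, encoding = crawler.adv_get('http://example.com/', timeout=10)

    if contenttype in crawler.MIMETYPE['html']:
        text = data.decode(encoding)  # decode the raw bytes with the detected encoding

    body_only = crawler.get('http://example.com/', timeout=10)  # same value as data above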
@@ -251,14 +251,12 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
-        data = con.read()
+        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
 
     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode
 
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
     if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
         log('non-text page')
         return True
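For reference, the Content-Type parsing that each call site no longer repeats is the same one-liner now executed once inside `adv_get`; a standalone illustration with a made-up header value:

    # What the removed per-call-site line computed, now done once in adv_get:
    header = 'text/html; charset=UTF-8'   # example value of con.info().get('Content-Type', '')
    contenttype = header.split(';')[0]    # -> 'text/html' (parameters after ';' are dropped)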
@@ -324,15 +322,11 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
-            .open(url, timeout=TIMEOUT * 2)
-        xml = con.read()
+        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
 
     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')
 
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
-
     if options.items:
         # using custom rules
         rss = feeds.FeedHTML(xml)
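The call sites above pass `delay=` and `follow='rss'` to `adv_get` even though its signature only names `url` and `timeout`; they travel through `*args`/`**kwargs` into `custom_handler()`. A self-contained sketch of that forwarding pattern (stub functions for illustration, not morss code):

    # Stubs that mimic the signatures involved, to show where each keyword lands.
    def custom_handler(follow=None, delay=None, encoding=None):
        print('handler:', follow, delay, encoding)      # receives follow/delay/encoding

    def adv_get(url, timeout=None, *args, **kwargs):
        custom_handler(*args, **kwargs)                  # extra keywords end up here
        print('open:', url, 'timeout =', timeout)        # timeout stays in the wrapper

    adv_get('http://example.com/feed', follow='rss', delay=0, timeout=20)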
@@ -652,10 +646,7 @@ def cgi_page(environ, start_response):
     if urlparse(url).scheme not in ['http', 'https']:
         url = 'http://' + url
 
-    con = crawler.custom_handler().open(url)
-    data = con.read()
-
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    data, con, contenttype, encoding = crawler.adv_get(url=url)
 
     if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
         html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
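This last call site passes no timeout, so `adv_get` takes its `timeout is None` branch and calls `open(url)` without one; a caller that wants to bound the request can still pass one explicitly (the 10-second value is only an example):

    data, con, contenttype, encoding = crawler.adv_get(url=url, timeout=10)  # routed to open(url, timeout=10)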