Use wrapper for http calls

parent 0ae0dbc175
commit 7691df5257
@@ -34,6 +34,25 @@ MIMETYPE = {
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 
 
+def get(*args, **kwargs):
+    return adv_get(*args, **kwargs)[0]
+
+
+def adv_get(url, timeout=None, *args, **kwargs):
+    if timeout is None:
+        con = custom_handler(*args, **kwargs).open(url)
+
+    else:
+        con = custom_handler(*args, **kwargs).open(url, timeout=timeout)
+
+    data = con.read()
+
+    contenttype = con.info().get('Content-Type', '').split(';')[0]
+    encoding= detect_encoding(data, con)
+
+    return data, con, contenttype, encoding
+
+
 def custom_handler(follow=None, delay=None, encoding=None):
     handlers = []
 
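For reference, a minimal usage sketch of the new wrapper (not part of the commit; the morss.crawler import path and example_url are assumptions based on the package layout):

# Illustration only; import path and URL are assumptions, not part of this commit.
from morss import crawler

example_url = 'http://www.example.com/feed.xml'

# adv_get() wraps custom_handler(...).open(url) and returns the raw body,
# the connection object, the bare Content-Type and the detected encoding.
data, con, contenttype, encoding = crawler.adv_get(url=example_url, timeout=4)

# get() is the shortcut that keeps only the body (first element of the tuple).
body = crawler.get(example_url, timeout=4)

Call sites below unpack the same four-tuple instead of reading the connection and parsing Content-Type themselves.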
@@ -251,14 +251,12 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT)
-        data = con.read()
+        data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT)
 
     except (IOError, HTTPException) as e:
         log('http error')
         return False # let's just delete errors stuff when in cache mode
 
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
     if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
         log('non-text page')
         return True
@@ -324,15 +322,11 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \
-            .open(url, timeout=TIMEOUT * 2)
-        xml = con.read()
+        xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2)
 
     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')
 
-    contenttype = con.info().get('Content-Type', '').split(';')[0]
-
     if options.items:
         # using custom rules
         rss = feeds.FeedHTML(xml)
@@ -652,10 +646,7 @@ def cgi_page(environ, start_response):
         if urlparse(url).scheme not in ['http', 'https']:
             url = 'http://' + url
 
-        con = crawler.custom_handler().open(url)
-        data = con.read()
-
-        contenttype = con.info().get('Content-Type', '').split(';')[0]
+        data, con, contenttype, encoding = crawler.adv_get(url=url)
 
         if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
             html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
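A small follow-on sketch, assumed rather than shown in the commit: because adv_get() now hands back the raw bytes together with the detected encoding, a caller that needs text can decode explicitly instead of guessing a charset itself.

# Assumed usage, not part of this commit: decode the raw body with the
# encoding returned by adv_get(); the utf-8 fallback is an assumption for
# the case where detection returns nothing.
from morss import crawler

data, con, contenttype, encoding = crawler.adv_get(url='http://www.example.com/')

if contenttype in crawler.MIMETYPE['html'] or contenttype == 'text/plain':
    text = data.decode(encoding or 'utf-8', errors='replace')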