Use HTTP Accept headers
Tries to tell servers what we need; hopefully this will avoid the useless downloading of PDFs, images and so on (especially on Hacker News).
Branch: master
parent 0de5adc505
commit 2634dab40c

morss.py · 37 lines changed
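
For context, the change makes SimpleDownload send a browser-style Accept header on every request. A minimal hand-rolled equivalent (not part of the commit; the URL is a placeholder) looks like this:

import urllib2

req = urllib2.Request('http://example.com/article')
# The same header the diff below keeps as a commented-out reference:
req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')

con = urllib2.urlopen(req)
print con.info().gettype()  # content type the server chose to send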
@@ -200,13 +200,15 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
 	to save bandwidth. The given headers are added back into the header on error
 	304 for easier use.
 	"""
-	def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=False, cookiejar=None):
+	def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=False, cookiejar=None, accept=None, strict=False):
 		urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
 		self.cache = cache
 		self.etag = etag
 		self.lastmodified = lastmodified
 		self.useragent = useragent
 		self.decode = decode
+		self.accept = accept
+		self.strict = strict
 
 	def http_request(self, req):
 		urllib2.HTTPCookieProcessor.http_request(self, req)
@@ -220,6 +222,35 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
 			req.add_unredirected_header('If-None-Match', self.etag)
 		if self.lastmodified:
 			req.add_unredirected_header('If-Modified-Since', self.lastmodified)
+
+		if self.accept is not None:
+			# req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
+			if isinstance(self.accept, basestring):
+				self.accept = (self.accept,)
+
+			out = {}
+			for (i, group) in enumerate(self.accept):
+				rank = 1 - i*0.1
+
+				if isinstance(group, basestring):
+					if group in MIMETYPE:
+						group = MIMETYPE[group]
+					else:
+						out[group] = rank
+						continue
+
+				for mime in group:
+					if mime not in out:
+						out[mime] = rank
+
+			if not self.strict:
+				out['*/*'] = rank-0.1
+
+			string = ','.join([x+';q={:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
+			log(string)
+
+			req.add_unredirected_header('Accept', string)
+
 		return req
 
 	def http_error_304(self, req, fp, code, msg, headers):
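
The MIMETYPE dict used above is defined elsewhere in morss.py and is not part of this diff. A standalone sketch of the same ranking logic, with a stand-in MIMETYPE assumed purely for illustration, makes the generated header easy to inspect:

# Stand-in only: the real MIMETYPE in morss.py may list other types.
MIMETYPE = {
	'xml': ('text/xml', 'application/xml'),
	'html': ('text/html', 'application/xhtml+xml'),
}

def build_accept(accept, strict=False):
	# Mirrors the logic added in http_request() above.
	if isinstance(accept, basestring):
		accept = (accept,)

	out = {}
	for (i, group) in enumerate(accept):
		rank = 1 - i*0.1  # first group gets q=1, the next 0.9, and so on

		if isinstance(group, basestring):
			if group in MIMETYPE:
				group = MIMETYPE[group]  # shorthand like 'html'
			else:
				out[group] = rank  # literal mime type like 'text/*'
				continue

		for mime in group:
			if mime not in out:
				out[mime] = rank

	if not strict:
		out['*/*'] = rank - 0.1  # catch-all ranked below the last group

	# q=1 is the default, so only lower ranks are spelled out
	return ','.join([x + ';q={:.1}'.format(out[x]) if out[x] != 1 else x for x in out])

print build_accept(('html', 'text/*'), strict=True)
# -> text/html,application/xhtml+xml,text/*;q=0.9 (dict order may vary)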
@@ -408,7 +439,7 @@ def Fill(item, cache, feedurl='/', fast=False):
 	# download
 	try:
 		url = link.encode('utf-8')
-		con = urllib2.build_opener(SimpleDownload(decode=True)).open(url, timeout=TIMEOUT)
+		con = urllib2.build_opener(SimpleDownload(decode=True, accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
 		data = con.read()
 	except (IOError, httplib.HTTPException):
 		log('http error')
@@ -459,7 +490,7 @@ def Gather(url, cachePath, options):
 		style = cache.get('style')
 	else:
 		try:
-			opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), decode=False)
+			opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), decode=False, accept=('xml','html'))
 			con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT)
 			xml = con.read()
 		except (IOError, httplib.HTTPException):
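
Reusing build_accept from the earlier sketch (and its assumed stand-in MIMETYPE), the two call sites would expand to roughly these headers:

# Fill(): article pages, HTML only; strict=True drops the */* catch-all,
# so a server that honours Accept can answer 406 instead of sending a
# PDF or an image.
print build_accept(('html', 'text/*'), strict=True)
# -> text/html,application/xhtml+xml,text/*;q=0.9

# Gather(): feeds ranked above HTML, with the low-ranked catch-all kept
# since strict defaults to False.
print build_accept(('xml', 'html'))
# -> text/xml,application/xml,text/html;q=0.9,application/xhtml+xml;q=0.9,*/*;q=0.8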