Use HTTP Accept headers

Tries to tell servers what we need, hopefully this will avoid the useless downloading of pdf, images and so on (especially on Hacker News)
master
pictuga 2013-11-30 19:33:36 +01:00
parent 0de5adc505
commit 2634dab40c
1 changed file with 34 additions and 3 deletions

View File

@ -200,13 +200,15 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
to save bandwidth. The given headers are added back into the header on error to save bandwidth. The given headers are added back into the header on error
304 for easier use. 304 for easier use.
""" """
def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=False, cookiejar=None, accept=None, strict=False):
	"""Configure the download handler.

	cache        -- previously cached body, handed back on HTTP 304
	etag         -- value to send as If-None-Match
	lastmodified -- date to send as If-Modified-Since
	useragent    -- User-Agent header value
	decode       -- whether to decode the response body
	cookiejar    -- cookie jar forwarded to HTTPCookieProcessor
	accept       -- mime type(s)/shorthand group(s) used to build the
	                Accept header (string or sequence; see http_request)
	strict       -- when False, a catch-all '*/*' entry is appended to
	                the generated Accept header
	"""
	urllib2.HTTPCookieProcessor.__init__(self, cookiejar)

	# Plain attribute storage; the headers themselves are only added
	# later, in http_request().
	self.useragent = useragent
	self.decode = decode
	self.cache = cache
	self.etag = etag
	self.lastmodified = lastmodified
	self.accept = accept
	self.strict = strict
def http_request(self, req): def http_request(self, req):
urllib2.HTTPCookieProcessor.http_request(self, req) urllib2.HTTPCookieProcessor.http_request(self, req)
@ -220,6 +222,35 @@ class SimpleDownload(urllib2.HTTPCookieProcessor):
req.add_unredirected_header('If-None-Match', self.etag) req.add_unredirected_header('If-None-Match', self.etag)
if self.lastmodified: if self.lastmodified:
req.add_unredirected_header('If-Modified-Since', self.lastmodified) req.add_unredirected_header('If-Modified-Since', self.lastmodified)
if self.accept is not None:
	# Build an Accept header from the configured mime groups, e.g.
	# 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
	# so servers can avoid sending us content types we cannot use.
	#
	# Normalise a single string to a tuple LOCALLY instead of clobbering
	# self.accept -- preparing a request must not mutate the handler's
	# configuration.
	accept = self.accept
	if isinstance(accept, basestring):
		accept = (accept,)

	out = {}
	# Rank each group by position: first group is q=1 (implicit),
	# each following group drops by 0.1.
	# Start rank at 1 so the '*/*' fallback below is well-defined even
	# when accept is an empty sequence (previously a NameError).
	rank = 1
	for (i, group) in enumerate(accept):
		rank = 1 - i*0.1

		if isinstance(group, basestring):
			if group in MIMETYPE:
				# shorthand (e.g. 'html') -> expand to its list of mime types
				group = MIMETYPE[group]
			else:
				# already a literal mime type
				out[group] = rank
				continue

		for mime in group:
			# first occurrence wins, so a mime type keeps its best rank
			if mime not in out:
				out[mime] = rank

	if not self.strict:
		# non-strict mode: accept anything else, ranked just below the
		# last explicit group
		out['*/*'] = rank-0.1

	# q=1 entries are emitted bare (q defaults to 1 per the HTTP spec)
	string = ','.join([x+';q={:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
	log(string)
	req.add_unredirected_header('Accept', string)
return req return req
def http_error_304(self, req, fp, code, msg, headers): def http_error_304(self, req, fp, code, msg, headers):
@ -408,7 +439,7 @@ def Fill(item, cache, feedurl='/', fast=False):
# download # download
try: try:
url = link.encode('utf-8') url = link.encode('utf-8')
con = urllib2.build_opener(SimpleDownload(decode=True)).open(url, timeout=TIMEOUT) con = urllib2.build_opener(SimpleDownload(decode=True, accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
data = con.read() data = con.read()
except (IOError, httplib.HTTPException): except (IOError, httplib.HTTPException):
log('http error') log('http error')
@ -459,7 +490,7 @@ def Gather(url, cachePath, options):
style = cache.get('style') style = cache.get('style')
else: else:
try: try:
opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), decode=False) opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), decode=False, accept=('xml','html'))
con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT) con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT)
xml = con.read() xml = con.read()
except (IOError, httplib.HTTPException): except (IOError, httplib.HTTPException):