crawler: clean up caching code

master
pictuga 2021-11-08 22:02:23 +01:00
parent c71cf5d5ce
commit cb21871c35
2 changed files with 115 additions and 78 deletions


@@ -108,7 +108,7 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
     }


-def custom_opener(follow=None, delay=None):
+def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
     handlers = []

     # as per urllib2 source code, these Handelers are added first
@@ -142,7 +142,7 @@ def custom_opener(follow=None, delay=None):
     if follow:
         handlers.append(AlternateHandler(MIMETYPE[follow]))

-    handlers.append(CacheHandler(force_min=delay))
+    handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max))

    return build_opener(*handlers)
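
With this change a caller picks the caching behaviour by name instead of encoding it in a magic delay value. A minimal usage sketch of the reworked opener, with illustrative parameter values that are not part of the commit:

    # hypothetical usage of the new custom_opener signature shown above
    opener = custom_opener(follow='rss',
                           policy='cached',    # reuse the cache, fetch online if missing
                           force_min=5*60,     # always trust a copy younger than 5 min
                           force_max=60*60)    # never serve a copy older than 1 h
    response = opener.open('http://example.com/feed.xml')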
@@ -426,31 +426,50 @@ class HTTPRefreshHandler(BaseHandler):
     https_response = http_response


+def error_response(code, msg, url=''):
+    # return an error as a response
+    resp = addinfourl(BytesIO(), message_from_string('\n\n'), url, code)
+    resp.msg = msg
+    return resp
+
+
 class CacheHandler(BaseHandler):
     " Cache based on etags/last-modified "

-    private_cache = False   # Websites can indicate whether the page should be
-                            # cached by CDNs (e.g. shouldn't be the case for
-                            # private/confidential/user-specific pages.
-                            # With this setting, decide whether (False) you want
-                            # the cache to behave like a CDN (i.e. don't cache
-                            # private pages), or (True) to behave like a end-cache
-                            # private pages. If unsure, False is the safest bet.
+    privacy = 'private'     # Websites can indicate whether the page should be cached
+                            # by CDNs (e.g. shouldn't be the case for
+                            # private/confidential/user-specific pages. With this
+                            # setting, decide whether you want the cache to behave
+                            # like a CDN (i.e. don't cache private pages, 'public'),
+                            # or to behave like a end-user private pages
+                            # ('private'). If unsure, 'public' is the safest bet,
+                            # but many websites abuse this feature...
+
+                            # NB. This overrides all the other min/max/policy settings.

     handler_order = 499

-    def __init__(self, cache=None, force_min=None):
+    def __init__(self, cache=None, force_min=None, force_max=None, policy=None):
         self.cache = cache or default_cache
         self.force_min = force_min
-        # Servers indicate how long they think their content is "valid".
-        # With this parameter (force_min, expressed in seconds), we can
-        # override the validity period (i.e. bypassing http headers)
-        # Special values:
-        #   -1: valid forever, i.e. use the cache no matter what (and fetch
-        #       the page online if not present in cache)
-        #    0: valid zero second, i.e. force refresh
-        #   -2: same as -1, i.e. use the cache no matter what, but do NOT
-        #       fetch the page online if not present in cache, throw an
-        #       error instead
+        self.force_max = force_max
+        self.policy = policy # can be cached/refresh/offline/None (default)
+
+        # Servers indicate how long they think their content is "valid". With
+        # this parameter (force_min/max, expressed in seconds), we can override
+        # the validity period (i.e. bypassing http headers)
+        # Special choices, via "policy":
+        #   cached: use the cache no matter what (and fetch the page online if
+        #           not present in cache)
+        #   refresh: valid zero second, i.e. force refresh
+        #   offline: same as cached, i.e. use the cache no matter what, but do
+        #            NOT fetch the page online if not present in cache, throw an
+        #            error instead
+        #   None: just follow protocols
+
+        # sanity checks
+        assert self.force_max is None or self.force_max >= 0
+        assert self.force_min is None or self.force_min >= 0
+        assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min

     def load(self, url):
         try:
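
Per the old and new comments in this hunk, the named policies replace the old numeric delay values: -1 becomes 'cached', 0 becomes 'refresh', -2 becomes 'offline', and None still means "follow the HTTP headers". A rough sketch of wiring the handler by hand under the new API, assuming Python 3's urllib.request (not taken from the commit itself):

    from urllib.request import build_opener

    handler = CacheHandler(policy='offline')     # old force_min=-2: cache only, never go online
    # handler = CacheHandler(policy='cached')    # old force_min=-1: use cache, else fetch
    # handler = CacheHandler(policy='refresh')   # old force_min=0: always refetch
    opener = build_opener(handler)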
@@ -468,26 +487,25 @@ class CacheHandler(BaseHandler):
         data['headers'] = unicode(data['headers'])
         self.cache[key] = pickle.dumps(data, 0)

-    def is_cached(self, key):
-        return self.load(key) is not None
-
-    def cached_response(self, req):
-        # this does NOT check whether it's already cached, use with care
+    def cached_response(self, req, fallback=None):
         data = self.load(req.get_full_url())

-        # return the cache as a response
-        resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
-        resp.msg = data['msg']
+        if data is not None:
+            # return the cache as a response
+            resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
+            resp.msg = data['msg']
+            return resp

-        return resp
+        else:
+            return fallback

     def save_response(self, req, resp):
         data = resp.read()

         self.save(req.get_full_url(), {
             'code': resp.code,
             'msg': resp.msg,
-            'headers': resp.headers,
+            'headers': str(resp.headers),
             'data': data,
             'timestamp': time.time()
         })
@@ -516,12 +534,13 @@ class CacheHandler(BaseHandler):
         # If 'None' is returned, try your chance with the next-available handler
         # If a 'resp' is returned, stop there, and proceed with 'http_response'

-        # Here, we try to see whether we want to use data from cache (i.e.
-        # return 'resp'), or whether we want to refresh the content (return
-        # 'None')
-
         data = self.load(req.get_full_url())

-        if data is None:
-            # cache empty, refresh
-            return None
-
-        # some info needed to process everything
-        cache_control = parse_http_list(data['headers'].get('cache-control', ()))
-        cache_control += parse_http_list(data['headers'].get('pragma', ()))
+        if data is not None:
+            # some info needed to process everything
+            cache_control = parse_http_list(data['headers'].get('cache-control', ()))
+            cache_control += parse_http_list(data['headers'].get('pragma', ()))
@@ -531,45 +550,58 @@ class CacheHandler(BaseHandler):
-        cache_age = time.time() - data['timestamp']
+            cache_age = time.time() - data['timestamp']

-        # list in a simple way what to do when
-        if self.force_min == -2:
-            if data['code'] is not None:
-                # already in cache, perfect, use cache
-                return self.cached_response(req)
+        # list in a simple way what to do in special cases

-            else:
-                # raise an error, via urllib handlers
-                resp = addinfourl(BytesIO(), data['headers'], req.get_full_url(), 409)
-                resp.msg = 'Conflict'
-                return resp
+        if data is not None and 'private' in cc_list and self.privacy == 'public':
+            # private data but public cache, do not use cache
+            # privacy concern, so handled first and foremost
+            # (and doesn't need to be addressed anymore afterwards)
+            return None

-        elif self.force_min == -1:
-            # force use cache
-            return self.cached_response(req)
+        elif self.policy == 'offline':
+            # use cache, or return an error
+            return self.cached_response(
+                req,
+                error_response(409, 'Conflict', req.get_full_url())
+            )

-        elif self.force_min == 0:
+        elif self.policy == 'cached':
+            # use cache, or fetch online
+            return self.cached_response(req, None)
+
+        elif self.policy == 'refresh':
             # force refresh
             return None

+        elif data is None:
+            # we have already settled all the cases that don't need the cache.
+            # all the following ones need the cached item
+            return None
+
+        elif self.force_max is not None and cache_age > self.force_max:
+            # older than we want, refresh
+            return None
+
+        elif self.force_min is not None and cache_age < self.force_min:
+            # recent enough, use cache
+            return self.cached_response(req)
+
         elif data['code'] == 301 and cache_age < 7*24*3600:
             # "301 Moved Permanently" has to be cached...as long as we want
             # (awesome HTTP specs), let's say a week (why not?). Use force_min=0
             # if you want to bypass this (needed for a proper refresh)
             return self.cached_response(req)

-        elif (self.force_min is None or self.force_min > 0) and ('no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache)):
-            # kindly follow web servers indications, refresh
-            # if the same settings are used all along, this section shouldn't be
-            # of any use, since the page woudln't be cached in the first place
-            # the check is only performed "just in case"
+        elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
+            # kindly follow web servers indications, refresh if the same
+            # settings are used all along, this section shouldn't be of any use,
+            # since the page woudln't be cached in the first place the check is
+            # only performed "just in case"
+            # NB. NOT respected if force_min is set
             return None

         elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
-            # server says it's still fine (and we trust him, if not, use force_min=0), use cache
-            return self.cached_response(req)
-
-        elif self.force_min is not None and self.force_min > cache_age:
-            # still recent enough for us, use cache
+            # server says it's still fine (and we trust him, if not, use overrides), use cache
             return self.cached_response(req)

         else:
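
Read top to bottom, the new elif chain gives a clear precedence: privacy first, then an explicit policy, then the absence of a cached copy, then force_max and force_min, the 301 special case, the server's no-cache/no-store hints, and only then max-age. A small illustrative walk-through using the force_min/force_max values that FeedFetch passes further down (the ages are made up for the example):

    handler = CacheHandler(policy=None, force_min=5*60, force_max=60*60)
    # cache_age =  2 min  -> below force_min: cached copy returned, headers not consulted
    # cache_age = 30 min  -> neither override applies: the server's max-age decides
    # cache_age =  2 h    -> above force_max: entry considered stale, refetched online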
@@ -580,19 +612,19 @@ class CacheHandler(BaseHandler):
         # code for after-fetch, to know whether to save to hard-drive (if stiking to http headers' will)
         # NB. It might re-save requests pulled from cache, which will re-set the time() to the latest, i.e. lenghten its useful life

-        if resp.code == 304 and self.is_cached(resp.url):
+        if resp.code == 304 and resp.url in self.cache:
             # we are hopefully the first after the HTTP handler, so no need
             # to re-run all the *_response
             # here: cached page, returning from cache
             return self.cached_response(req)

-        elif ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
+        elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
             cache_control = parse_http_list(resp.headers.get('cache-control', ()))
             cache_control += parse_http_list(resp.headers.get('pragma', ()))

             cc_list = [x for x in cache_control if '=' not in x]

-            if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
+            if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
                 # kindly follow web servers indications (do not save & return)
                 return resp


@@ -194,21 +194,20 @@ def ItemFill(item, options, feedurl='/', fast=False):
     log(item.link)

     # download
-    delay = -1
-
-    if fast or options.fast:
+    if fast or options.cache:
         # force cache, don't fetch
-        delay = -2
+        policy = 'offline'

     elif options.force:
         # force refresh
-        delay = 0
+        policy = 'refresh'

     else:
-        delay = 24*60*60 # 24h
+        policy = None

     try:
-        req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
+        req = crawler.adv_get(url=item.link, policy=policy, force_min=24*60*60, timeout=TIMEOUT)

     except (IOError, HTTPException) as e:
         log('http error')
@@ -266,11 +265,17 @@ def FeedFetch(url, options):
     # fetch feed
     delay = DELAY

-    if options.force:
-        delay = 0
+    if options.cache:
+        policy = 'offline'
+
+    elif options.force:
+        policy = 'refresh'
+
+    else:
+        policy = None

     try:
-        req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), delay=delay, timeout=TIMEOUT * 2)
+        req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), policy=policy, force_min=5*60, force_max=60*60, timeout=TIMEOUT * 2)

     except (IOError, HTTPException):
         raise MorssException('Error downloading feed')
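
For a caller outside these two functions, the new keyword arguments to adv_get look like the sketch below; this only assumes the interface visible in the hunks above, and the URL and timeout values are placeholders:

    from morss import crawler

    # follow HTTP caching headers, but trust any cached copy for at least
    # 5 minutes and never serve one older than an hour (the FeedFetch values above)
    req = crawler.adv_get(url='http://example.com/feed.xml',
                          policy=None, force_min=5*60, force_max=60*60, timeout=4)

    # work strictly offline: a 409 Conflict response comes back if the page
    # was never cached, instead of going online
    req = crawler.adv_get(url='http://example.com/feed.xml', policy='offline', timeout=4)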