crawler: separate CacheHandler and actual caching

Default cache is now just an in-memory {}
commit 194465544a (master)
parent 523b250907
Author: pictuga
Date:   2017-11-04 12:41:56 +01:00

2 changed files with 29 additions and 39 deletions
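The refactor separates the HTTP logic from the storage: CacheHandler now talks to any dict-like object (__getitem__ that raises KeyError on a miss, __setitem__ that stores a (code, msg, headers, data, timestamp) tuple), and the module-level default_cache is a plain {}. A minimal sketch of the resulting API, assuming the morss.crawler layout shown in the diff below:

    from morss import crawler

    # out of the box, responses land in the module-level in-memory dict
    opener = crawler.custom_handler(delay=60)   # installs CacheHandler(force_min=60)

    # for persistence across runs, swap in the SQLite-backed store first
    crawler.default_cache = crawler.SQLiteCache('/tmp/morss-cache.db')

    # any object speaking the same dict protocol would work just as well
    crawler.default_cache = {}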

morss/crawler.py

@@ -9,7 +9,6 @@ import re
 import chardet
 from cgi import parse_header
 import lxml.html
-import sqlite3
 import time
 
 try:
@@ -61,7 +60,7 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
     if accept:
         handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
 
-    handlers.append(SQliteCacheHandler(delay))
+    handlers.append(CacheHandler(force_min=delay))
 
     return build_opener(*handlers)
@@ -311,42 +310,37 @@ class HTTPRefreshHandler(BaseHandler):
     https_response = http_response
 
 
-class BaseCacheHandler(BaseHandler):
-    " Cache based on etags/last-modified. Inherit from this to implement actual storage "
+default_cache = {}
+
+
+class CacheHandler(BaseHandler):
+    " Cache based on etags/last-modified "
 
     private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
     handler_order = 499
 
-    def __init__(self, force_min=None):
+    def __init__(self, cache=None, force_min=None):
+        self.cache = cache or default_cache
         self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache
 
-    def _load(self, url):
-        out = list(self.load(url))
+    def load(self, url):
+        try:
+            out = list(self.cache[url])
+        except KeyError:
+            out = [None, None, unicode(), bytes(), 0]
 
         if sys.version_info[0] >= 3:
             out[2] = email.message_from_string(out[2] or unicode()) # headers
         else:
             out[2] = mimetools.Message(StringIO(out[2] or unicode()))
 
-        out[3] = out[3] or bytes() # data
-        out[4] = out[4] or 0 # timestamp
-
         return out
 
-    def load(self, url):
-        " Return the basic vars (code, msg, headers, data, timestamp) "
-        return (None, None, None, None, None)
-
-    def _save(self, url, code, msg, headers, data, timestamp):
-        headers = unicode(headers)
-        self.save(url, code, msg, headers, data, timestamp)
-
     def save(self, url, code, msg, headers, data, timestamp):
-        " Save values to disk "
-        pass
+        self.cache[url] = (code, msg, unicode(headers), buffer(data), timestamp)
 
     def http_request(self, req):
-        (code, msg, headers, data, timestamp) = self._load(req.get_full_url())
+        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())
 
         if 'etag' in headers:
             req.add_unredirected_header('If-None-Match', headers['etag'])
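In effect, load() now normalizes a miss itself instead of delegating to a storage subclass: a backend only has to raise KeyError for an unknown URL and accept the 5-tuple on assignment. A small illustration of that contract (the tuple shape comes from the diff; the URLs and values are made up):

    cache = {}  # same shape as the new default_cache

    # what save() stores: (code, msg, headers serialized to a string, data, timestamp)
    cache['http://example.com/feed'] = (200, 'OK', 'etag: "abc"', b'<rss/>', 1509795716)

    # what load() does on a miss, before parsing the headers back
    try:
        entry = list(cache['http://example.com/missing'])
    except KeyError:
        entry = [None, None, '', b'', 0]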
@@ -357,7 +351,7 @@ class BaseCacheHandler(BaseHandler):
         return req
 
     def http_open(self, req):
-        (code, msg, headers, data, timestamp) = self._load(req.get_full_url())
+        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())
 
         # some info needed to process everything
         cache_control = parse_http_list(headers.get('cache-control', ()))
@@ -448,7 +442,7 @@ class BaseCacheHandler(BaseHandler):
         # save to disk
         data = resp.read()
-        self._save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
+        self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
 
         fp = BytesIO(data)
         old_resp = resp
@@ -458,11 +452,11 @@ class BaseCacheHandler(BaseHandler):
         return resp
 
     def http_error_304(self, req, fp, code, msg, headers):
-        cache = list(self._load(req.get_full_url()))
+        cache = list(self.load(req.get_full_url()))
 
         if cache[0]:
             cache[-1] = time.time()
-            self._save(req.get_full_url(), *cache)
+            self.save(req.get_full_url(), *cache)
 
             new = Request(req.get_full_url(),
                           headers=req.headers,
@@ -479,13 +473,11 @@ class BaseCacheHandler(BaseHandler):
     https_response = http_response
 
 
-sqlite_default = ':memory:'
+import sqlite3
 
 
-class SQliteCacheHandler(BaseCacheHandler):
-    def __init__(self, force_min=-1, filename=None):
-        BaseCacheHandler.__init__(self, force_min)
-
-        self.con = sqlite3.connect(filename or sqlite_default, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
+class SQLiteCache:
+    def __init__(self, filename=':memory:'):
+        self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
 
         with self.con:
@@ -499,20 +491,18 @@ class SQliteCacheHandler(BaseCacheHandler):
-    def load(self, url):
+    def __getitem__(self, url):
         row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
 
         if not row:
-            return (None, None, None, None, None)
+            raise KeyError
 
         return row[1:]
 
-    def save(self, url, code, msg, headers, data, timestamp):
-        data = buffer(data)
-
+    def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
         if self.con.execute('SELECT code FROM data WHERE url=?', (url,)).fetchone():
             with self.con:
                 self.con.execute('UPDATE data SET code=?, msg=?, headers=?, data=?, timestamp=? WHERE url=?',
-                    (code, msg, headers, data, timestamp, url))
+                    value + (url,))
 
         else:
             with self.con:
-                self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url, code, msg, headers, data, timestamp))
+                self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)
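Since the handler only ever calls __getitem__ and __setitem__, SQLiteCache can be swapped for any mapping-style store. A hypothetical shelve-backed cache, not part of this commit, just to show how small a backend now is:

    import shelve

    class ShelveCache:
        " Hypothetical backend persisting entries with the stdlib shelve module "
        def __init__(self, filename):
            self.db = shelve.open(filename)

        def __getitem__(self, url):
            return self.db[url]  # shelve raises KeyError on a miss, as CacheHandler.load() expects

        def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
            self.db[url] = value
            self.db.sync()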

morss/morss.py

@@ -483,7 +483,7 @@ def process(url, cache=None, options=None):
     options = Options(options)
 
     if cache:
-        crawler.sqlite_default = cache
+        crawler.default_cache = crawler.SQLiteCache(cache)
 
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
@@ -544,7 +544,7 @@ def cgi_app(environ, start_response):
     else:
         headers['content-type'] = 'text/xml'
 
-    crawler.sqlite_default = os.path.join(os.getcwd(), 'morss-cache.db')
+    crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
 
     # get the work done
     rss = FeedFetch(url, options)
@@ -618,7 +618,7 @@ def cli_app():
     global DEBUG
     DEBUG = options.debug
 
-    crawler.sqlite_default = os.path.expanduser('~/.cache/morss-cache.db')
+    crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
 
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
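End to end, callers now build the cache object instead of pointing a global at a file; for instance, going by the process(url, cache=None, options=None) signature above (feed URL and path are placeholders):

    from morss import morss

    # per the first hunk of this file, process() installs
    # crawler.default_cache = crawler.SQLiteCache(cache) before fetching
    out = morss.process('http://example.com/feed.xml', cache='/tmp/morss-cache.db')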