crawler: separate CacheHandler and actual caching

Default cache is now just an in-memory {}

parent 523b250907
commit 194465544a
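In short, the handler keeps only the HTTP caching logic (ETag/Last-Modified, 304 handling) and delegates storage to any dict-like object: `load()` does a `cache[url]` lookup and treats `KeyError` as a miss, `save()` assigns a `(code, msg, headers, data, timestamp)` tuple back. A minimal sketch, not part of the commit, of picking a backend (the import path and the `force_min=600` value are assumptions for illustration):

# Minimal sketch, not part of the commit: the storage backend is any dict-like
# object. CacheHandler() falls back to the module-level default_cache dict;
# passing cache= swaps in another backend such as SQLiteCache.
from morss import crawler   # assumes morss is installed as a package

handler = crawler.CacheHandler(force_min=600)   # uses the in-memory default_cache
persistent = crawler.CacheHandler(cache=crawler.SQLiteCache('morss-cache.db'),
                                  force_min=600)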
crawler.py

@@ -9,7 +9,6 @@ import re
 import chardet
 from cgi import parse_header
 import lxml.html
-import sqlite3
 import time

 try:
@@ -61,7 +60,7 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
     if accept:
         handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))

-    handlers.append(SQliteCacheHandler(delay))
+    handlers.append(CacheHandler(force_min=delay))

     return build_opener(*handlers)

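`custom_handler()` now passes the delay straight to `CacheHandler(force_min=delay)` and chains it into a urllib opener. A usage sketch, not from the commit, assuming the Python 3 `urllib.request` names and an example URL:

# Sketch only: the handler plugs into a normal urllib opener chain.
# Per the comment in the diff, force_min is the number of seconds during which
# cached entries bypass HTTP headers (-1 forever, 0 never, -2 only serve what
# is already cached).
from urllib.request import build_opener

opener = build_opener(CacheHandler(force_min=10 * 60))
response = opener.open('http://example.com/feed.xml')
body = response.read()   # a copy cached less than ~10 minutes ago is reused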
@@ -311,42 +310,37 @@ class HTTPRefreshHandler(BaseHandler):
     https_response = http_response


-class BaseCacheHandler(BaseHandler):
-    " Cache based on etags/last-modified. Inherit from this to implement actual storage "
+default_cache = {}
+
+class CacheHandler(BaseHandler):
+    " Cache based on etags/last-modified "

     private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
     handler_order = 499

-    def __init__(self, force_min=None):
+    def __init__(self, cache=None, force_min=None):
+        self.cache = cache or default_cache
         self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache

-    def _load(self, url):
-        out = list(self.load(url))
+    def load(self, url):
+        try:
+            out = list(self.cache[url])
+
+        except KeyError:
+            out = [None, None, unicode(), bytes(), 0]

         if sys.version_info[0] >= 3:
             out[2] = email.message_from_string(out[2] or unicode()) # headers
         else:
             out[2] = mimetools.Message(StringIO(out[2] or unicode()))

-        out[3] = out[3] or bytes() # data
-        out[4] = out[4] or 0 # timestamp
-
         return out

-    def load(self, url):
-        " Return the basic vars (code, msg, headers, data, timestamp) "
-        return (None, None, None, None, None)
-
-    def _save(self, url, code, msg, headers, data, timestamp):
-        headers = unicode(headers)
-        self.save(url, code, msg, headers, data, timestamp)
-
     def save(self, url, code, msg, headers, data, timestamp):
-        " Save values to disk "
-        pass
+        self.cache[url] = (code, msg, unicode(headers), buffer(data), timestamp)

     def http_request(self, req):
-        (code, msg, headers, data, timestamp) = self._load(req.get_full_url())
+        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())

         if 'etag' in headers:
             req.add_unredirected_header('If-None-Match', headers['etag'])
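Because `load()` only does `self.cache[url]` and catches `KeyError`, and `save()` only assigns a tuple back, any mapping-like object can serve as storage. A hypothetical backend, not part of the commit (class name and size limit are made up for illustration):

# Hypothetical backend: anything implementing __getitem__/__setitem__ with
# KeyError on a miss satisfies CacheHandler. Values are the
# (code, msg, headers, data, timestamp) tuples that save() produces.
from collections import OrderedDict

class CappedCache(object):
    def __init__(self, limit=100):
        self.limit = limit
        self.entries = OrderedDict()

    def __getitem__(self, url):
        return self.entries[url]           # raises KeyError on a miss, as load() expects

    def __setitem__(self, url, value):     # value = (code, msg, headers, data, timestamp)
        self.entries[url] = value
        while len(self.entries) > self.limit:
            self.entries.popitem(last=False)   # drop the oldest entry

Such a backend would simply be passed as `CacheHandler(cache=CappedCache(500))`.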
@@ -357,7 +351,7 @@ class BaseCacheHandler(BaseHandler):
         return req

     def http_open(self, req):
-        (code, msg, headers, data, timestamp) = self._load(req.get_full_url())
+        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())

         # some info needed to process everything
         cache_control = parse_http_list(headers.get('cache-control', ()))
@@ -448,7 +442,7 @@ class BaseCacheHandler(BaseHandler):

         # save to disk
         data = resp.read()
-        self._save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
+        self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())

         fp = BytesIO(data)
         old_resp = resp
@@ -458,11 +452,11 @@ class BaseCacheHandler(BaseHandler):
         return resp

     def http_error_304(self, req, fp, code, msg, headers):
-        cache = list(self._load(req.get_full_url()))
+        cache = list(self.load(req.get_full_url()))

         if cache[0]:
             cache[-1] = time.time()
-            self._save(req.get_full_url(), *cache)
+            self.save(req.get_full_url(), *cache)

             new = Request(req.get_full_url(),
                           headers=req.headers,
@@ -479,13 +473,11 @@ class BaseCacheHandler(BaseHandler):
     https_response = http_response


-sqlite_default = ':memory:'
+import sqlite3


-class SQliteCacheHandler(BaseCacheHandler):
-    def __init__(self, force_min=-1, filename=None):
-        BaseCacheHandler.__init__(self, force_min)
-
+class SQLiteCache:
+    def __init__(self, filename=':memory:'):
         self.con = sqlite3.connect(filename or sqlite_default, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)

         with self.con:
@@ -499,20 +491,18 @@ class SQliteCacheHandler(BaseCacheHandler):
         row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()

         if not row:
-            return (None, None, None, None, None)
+            raise KeyError

         return row[1:]

-    def save(self, url, code, msg, headers, data, timestamp):
-        data = buffer(data)
-
+    def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
         if self.con.execute('SELECT code FROM data WHERE url=?', (url,)).fetchone():
             with self.con:
                 self.con.execute('UPDATE data SET code=?, msg=?, headers=?, data=?, timestamp=? WHERE url=?',
-                    (code, msg, headers, data, timestamp, url))
+                    value + (url,))

         else:
             with self.con:
-                self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url, code, msg, headers, data, timestamp))
+                self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)


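With `SQLiteCache` speaking the same dict protocol (`__setitem__` above, a lookup that raises `KeyError` when the URL is absent), swapping the process-wide in-memory dict for a persistent file is a one-liner, exactly as the morss.py hunks below do. A sketch, not from the commit (import path and cache location mirror the cli_app() hunk):

# Sketch only: replace the process-wide in-memory dict with a persistent SQLite
# file, mirroring what cgi_app()/cli_app() do in the following hunks.
import os.path
from morss import crawler   # assumes morss is installed as a package

crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))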
morss.py

@@ -483,7 +483,7 @@ def process(url, cache=None, options=None):
     options = Options(options)

     if cache:
-        crawler.sqlite_default = cache
+        crawler.default_cache = crawler.SQLiteCache(cache)

     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
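`process()` keeps its `cache` argument, but it is now interpreted as a filesystem path handed to `SQLiteCache`. A call sketch, not from the commit; the import path, feed URL and cache path are placeholder assumptions:

# Sketch only: when cache is given, process() points crawler.default_cache at an
# SQLiteCache for that path before fetching and formatting the feed.
from morss.morss import process   # assumed module location

output = process('http://example.com/feed.xml', cache='/tmp/morss-cache.db')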
@@ -544,7 +544,7 @@ def cgi_app(environ, start_response):
     else:
         headers['content-type'] = 'text/xml'

-    crawler.sqlite_default = os.path.join(os.getcwd(), 'morss-cache.db')
+    crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))

     # get the work done
     rss = FeedFetch(url, options)
@@ -618,7 +618,7 @@ def cli_app():
     global DEBUG
     DEBUG = options.debug

-    crawler.sqlite_default = os.path.expanduser('~/.cache/morss-cache.db')
+    crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))

     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)