crawler: separate CacheHandler and actual caching
Default cache is now just an in-memory {}
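
A rough sketch of how the pieces fit together after this change (illustration only; the import line and the /tmp filename are assumptions, the names themselves come from the diff below): CacheHandler now only talks to a dict-like cache object, the shared default is a plain in-memory dict, and SQLiteCache is the optional persistent backend.

    # illustration only, assuming the module is importable as "crawler"
    import crawler

    # default behaviour: handlers share the in-memory dict crawler.default_cache
    opener = crawler.custom_handler(delay=3600)

    # persistent variant: rebind the default before building handlers,
    # as morss.py does below; handlers created afterwards pick it up
    crawler.default_cache = crawler.SQLiteCache('/tmp/morss-cache.db')

    # or hand an explicit cache object to a single handler
    handler = crawler.CacheHandler(cache={}, force_min=3600)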
@@ -9,7 +9,6 @@ import re
 import chardet
 from cgi import parse_header
 import lxml.html
-import sqlite3
 import time
 
 try:
@@ -61,7 +60,7 @@ def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=F
     if accept:
         handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
 
-    handlers.append(SQliteCacheHandler(delay))
+    handlers.append(CacheHandler(force_min=delay))
 
     return build_opener(*handlers)
 
@@ -311,42 +310,37 @@ class HTTPRefreshHandler(BaseHandler):
     https_response = http_response
 
 
-class BaseCacheHandler(BaseHandler):
-    " Cache based on etags/last-modified. Inherit from this to implement actual storage "
+default_cache = {}
+
+
+class CacheHandler(BaseHandler):
+    " Cache based on etags/last-modified "
 
     private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
     handler_order = 499
 
-    def __init__(self, force_min=None):
+    def __init__(self, cache=None, force_min=None):
+        self.cache = cache or default_cache
         self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache
 
-    def _load(self, url):
-        out = list(self.load(url))
+    def load(self, url):
+        try:
+            out = list(self.cache[url])
+        except KeyError:
+            out = [None, None, unicode(), bytes(), 0]
 
         if sys.version_info[0] >= 3:
             out[2] = email.message_from_string(out[2] or unicode()) # headers
         else:
             out[2] = mimetools.Message(StringIO(out[2] or unicode()))
 
-        out[3] = out[3] or bytes() # data
-        out[4] = out[4] or 0 # timestamp
-
         return out
 
-    def load(self, url):
-        " Return the basic vars (code, msg, headers, data, timestamp) "
-        return (None, None, None, None, None)
-
-    def _save(self, url, code, msg, headers, data, timestamp):
-        headers = unicode(headers)
-        self.save(url, code, msg, headers, data, timestamp)
-
     def save(self, url, code, msg, headers, data, timestamp):
-        " Save values to disk "
-        pass
+        self.cache[url] = (code, msg, unicode(headers), buffer(data), timestamp)
 
     def http_request(self, req):
-        (code, msg, headers, data, timestamp) = self._load(req.get_full_url())
+        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())
 
         if 'etag' in headers:
             req.add_unredirected_header('If-None-Match', headers['etag'])
@@ -357,7 +351,7 @@ class BaseCacheHandler(BaseHandler):
         return req
 
     def http_open(self, req):
-        (code, msg, headers, data, timestamp) = self._load(req.get_full_url())
+        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())
 
         # some info needed to process everything
         cache_control = parse_http_list(headers.get('cache-control', ()))
@@ -448,7 +442,7 @@ class BaseCacheHandler(BaseHandler):
 
         # save to disk
         data = resp.read()
-        self._save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
+        self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())
 
         fp = BytesIO(data)
         old_resp = resp
@@ -458,11 +452,11 @@ class BaseCacheHandler(BaseHandler):
         return resp
 
     def http_error_304(self, req, fp, code, msg, headers):
-        cache = list(self._load(req.get_full_url()))
+        cache = list(self.load(req.get_full_url()))
 
         if cache[0]:
             cache[-1] = time.time()
-            self._save(req.get_full_url(), *cache)
+            self.save(req.get_full_url(), *cache)
 
             new = Request(req.get_full_url(),
                            headers=req.headers,
@@ -479,13 +473,11 @@ class BaseCacheHandler(BaseHandler):
     https_response = http_response
 
 
-sqlite_default = ':memory:'
+import sqlite3
 
 
-class SQliteCacheHandler(BaseCacheHandler):
-    def __init__(self, force_min=-1, filename=None):
-        BaseCacheHandler.__init__(self, force_min)
-
+class SQLiteCache:
+    def __init__(self, filename=':memory:'):
         self.con = sqlite3.connect(filename or sqlite_default, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
 
         with self.con:
@@ -499,20 +491,18 @@ class SQliteCacheHandler(BaseCacheHandler):
         row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
 
         if not row:
-            return (None, None, None, None, None)
+            raise KeyError
 
         return row[1:]
 
-    def save(self, url, code, msg, headers, data, timestamp):
-        data = buffer(data)
-
+    def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
         if self.con.execute('SELECT code FROM data WHERE url=?', (url,)).fetchone():
             with self.con:
                 self.con.execute('UPDATE data SET code=?, msg=?, headers=?, data=?, timestamp=? WHERE url=?',
-                    (code, msg, headers, data, timestamp, url))
+                    value + (url,))
 
         else:
             with self.con:
-                self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url, code, msg, headers, data, timestamp))
+                self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?)', (url,) + value)
 
 
@@ -483,7 +483,7 @@ def process(url, cache=None, options=None):
     options = Options(options)
 
     if cache:
-        crawler.sqlite_default = cache
+        crawler.default_cache = crawler.SQLiteCache(cache)
 
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
@@ -544,7 +544,7 @@ def cgi_app(environ, start_response):
     else:
         headers['content-type'] = 'text/xml'
 
-    crawler.sqlite_default = os.path.join(os.getcwd(), 'morss-cache.db')
+    crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db'))
 
     # get the work done
     rss = FeedFetch(url, options)
@@ -618,7 +618,7 @@ def cli_app():
     global DEBUG
     DEBUG = options.debug
 
-    crawler.sqlite_default = os.path.expanduser('~/.cache/morss-cache.db')
+    crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db'))
 
     rss = FeedFetch(url, options)
     rss = FeedGather(rss, url, options)
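
For reference, the only contract a replacement cache has to meet is the mapping protocol used above: __getitem__ raising KeyError on a miss, and __setitem__ storing the (code, msg, headers, data, timestamp) tuple. A hypothetical backend (not part of this commit) would look like:

    # hypothetical cache backend mirroring the protocol SQLiteCache implements
    class CustomCache(object):
        def __init__(self):
            self.data = {}

        def __getitem__(self, url):
            # a KeyError on a miss tells CacheHandler to start from scratch
            return self.data[url]

        def __setitem__(self, url, value):
            # value = (code, msg, headers, data, timestamp)
            self.data[url] = value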