crawler: add ability to limit cache size

master
pictuga 2020-09-30 23:59:55 +02:00
parent 2fc7cd391c
commit eed949736a
3 changed files with 63 additions and 2 deletions


@@ -316,6 +316,13 @@ will be cleared every time the program is run). Path can be defined with
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`

To limit the size of the cache (an illustrative configuration follows the list):
- `CACHE_SIZE` sets the target number of items in the cache (further items will
be deleted but the cache might be temporarily bigger than that). Defaults to 10k
entries.
- `CACHE_LIFESPAN` sets how often the cache must be trimmed (i.e. cut down to
the number of items set in `CACHE_SIZE`). Defaults to 1hr.
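A minimal sketch of how these variables fit together, assuming the package is importable as `morss.crawler`; the values themselves are only illustrative:

```python
import os

# illustrative values only
os.environ['CACHE'] = 'sqlite'        # pick the SQLite backend
os.environ['SQLITE_PATH'] = '/tmp'    # cache file becomes /tmp/morss-cache.db
os.environ['CACHE_SIZE'] = '1000'     # target at most ~1000 cached entries
os.environ['CACHE_LIFESPAN'] = '600'  # trim every 10 minutes

import morss.crawler  # the variables are read once, at import time
```

Note that the cache can exceed `CACHE_SIZE` between two trims: the limit is enforced periodically, not on every insert.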
## Configuration
### Length limitation


@@ -25,7 +25,9 @@ import chardet
from cgi import parse_header
import lxml.html
import time
import threading
import random
from collections import OrderedDict

try:
    # python 2
@@ -48,6 +50,10 @@ except NameError:
    basestring = unicode = str

CACHE_SIZE = int(os.getenv('CACHE_SIZE', 10000)) # max number of items in cache (default: 10k items)
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60*60)) # how often to auto-clear the cache (default: 1hr)

# uncomment the lines below to ignore SSL certs
#import ssl
#ssl._create_default_https_context = ssl._create_unverified_context
@@ -605,6 +611,17 @@ class CacheHandler(BaseHandler):

class BaseCache:
    """ Subclasses must behave like a dict """

    def trim(self):
        pass

    def autotrim(self, delay=CACHE_LIFESPAN):
        # trim the cache every so often
        self.trim()

        t = threading.Timer(delay, self.autotrim)
        t.start()

    def __contains__(self, url):
        try:
            self[url]
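`autotrim` re-arms itself through `threading.Timer`, so every trim runs on a fresh background thread. A standalone sketch of the same pattern (the `DemoCache` class and its 2-second delay are made up for illustration):

```python
import threading

class DemoCache(dict):
    # stand-in for BaseCache: a real trim() would evict entries
    def trim(self):
        print('trim: %d item(s) cached' % len(self))

    def autotrim(self, delay=2):
        self.trim()

        # re-arm: each run schedules the next one on a new timer thread
        t = threading.Timer(delay, self.autotrim)
        t.daemon = True  # for the demo only; the code above does not daemonize
        t.start()

cache = DemoCache()
cache.autotrim()          # first trim fires immediately, then every 2 seconds
cache['http://a/'] = 'x'
```

Because the timer in the actual code is not daemonized, a process that has called `autotrim()` should expect a live timer thread to remain until shutdown.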
@@ -627,9 +644,15 @@ class SQLiteCache(BaseCache):
        self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
        self.con.execute('pragma journal_mode=WAL')

        self.trim()

    def __del__(self):
        self.con.close()

    def trim(self):
        with self.con:
            self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))

    def __getitem__(self, url):
        row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
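The nested `SELECT` fetches the timestamp of the row just past the `CACHE_SIZE` cutoff (counting from newest), and everything at or below it is deleted; when the table holds fewer rows than the cutoff, the subquery yields no row, the `<=` comparison never matches, and nothing is deleted. A quick self-contained check of that behaviour, with a throwaway table and a cutoff of 5:

```python
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE data (url TEXT PRIMARY KEY, timestamp INT)')
con.executemany('INSERT INTO data VALUES (?, ?)',
    [('url%d' % i, i) for i in range(20)])  # 20 rows, timestamps 0..19

# same query shape as trim() above, keeping the 5 newest rows
con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (5,))

print(con.execute('SELECT COUNT(*) FROM data').fetchone())  # (5,)
```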
@@ -660,9 +683,15 @@ class MySQLCacheHandler(BaseCache):
        with self.cursor() as cursor:
            cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')

        self.trim()

    def cursor(self):
        return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()

    def trim(self):
        with self.cursor() as cursor:
            cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))

    def __getitem__(self, url):
        cursor = self.cursor()
        cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
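One detail worth noting on the MySQL side: MySQL normally refuses a `DELETE` whose subquery reads from the table being modified (error 1093), and wrapping the inner `SELECT` in a derived table under an alias (`( SELECT ... ) foo`) is the usual workaround, which is presumably why both queries carry the otherwise-unused `foo`. SQLite has no such restriction, so the same query shape works there unchanged.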
@@ -679,6 +708,19 @@ class MySQLCacheHandler(BaseCache):
            (url,) + value + value)
class CappedDict(OrderedDict, BaseCache):
    def trim(self):
        if CACHE_SIZE >= 0:
            # pop the oldest entries until only CACHE_SIZE of them remain
            for i in range(max(len(self) - CACHE_SIZE, 0)):
                self.popitem(False)

    def __setitem__(self, key, value):
        # https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
        if key in self:
            # re-inserting an existing key moves it to the (newest) end
            del self[key]
        OrderedDict.__setitem__(self, key, value)
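A hypothetical session showing the effect of both methods, assuming `CACHE_SIZE=10` was set in the environment before import:

```python
from morss.crawler import CappedDict  # assumes CACHE_SIZE=10 in the environment

cache = CappedDict()
for i in range(12):
    cache['url%d' % i] = 'data%d' % i

cache['url0'] = 'data0'   # re-inserting a key moves it to the newest end

cache.trim()              # 12 items: pops the 2 oldest ('url1', 'url2')
print(next(iter(cache)))  # 'url3'
```

Deleting and re-adding the key on every write keeps eviction ordered by last write rather than by first insertion (reads do not refresh an entry's position).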
if 'CACHE' in os.environ:
    if os.environ['CACHE'] == 'mysql':
        default_cache = MySQLCacheHandler(
@@ -689,10 +731,16 @@ if 'CACHE' in os.environ:
        )

    elif os.environ['CACHE'] == 'sqlite':
        if 'SQLITE_PATH' in os.environ:
            path = os.getenv('SQLITE_PATH') + '/morss-cache.db'

        else:
            path = ':memory:'

        default_cache = SQLiteCache(path)

else:
    default_cache = CappedDict()

if __name__ == '__main__':


@@ -284,6 +284,12 @@ def cgi_handle_request():

def cgi_start_server():
    crawler.default_cache.autotrim()

    print('Serving http://localhost:%s/' % PORT)
    httpd = wsgiref.simple_server.make_server('', PORT, application)
    httpd.serve_forever()
if "gunicorn" in os.getenv('SERVER_SOFTWARE'):
crawler.default_cache.autotrim()
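The second call site covers deployments where `cgi_start_server()` never runs: gunicorn (at least in versions of this era) exports `SERVER_SOFTWARE` into its workers' environment, so simply importing this module under gunicorn is enough to arm the trim timer.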