From eed949736adaf4190980102a3a663401891bcf53 Mon Sep 17 00:00:00 2001 From: pictuga Date: Wed, 30 Sep 2020 23:59:55 +0200 Subject: [PATCH] crawler: add ability to limit cache size --- README.md | 7 +++++++ morss/crawler.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++-- morss/wsgi.py | 6 ++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index be4bc6f..dfaf66c 100644 --- a/README.md +++ b/README.md @@ -316,6 +316,13 @@ will be cleared every time the program is run). Path can be defined with - `CACHE=mysql`: MySQL cache. Connection can be defined with the following environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST` +To limit the size of the cache: +- `CACHE_SIZE` sets the target number of items in the cache (further items will +be deleted but the cache might be temporarily bigger than that). Defaults to 10k +entries. +- `CACHE_LIFESPAN` sets how often (in seconds) the cache must be trimmed (i.e. cut down to +the number of items set in `CACHE_SIZE`). Defaults to 1hr. 
+ ## Configuration ### Length limitation diff --git a/morss/crawler.py b/morss/crawler.py index c9b5961..474db4b 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -25,7 +25,9 @@ import chardet from cgi import parse_header import lxml.html import time +import threading import random +from collections import OrderedDict try: # python 2 @@ -48,6 +50,10 @@ except NameError: basestring = unicode = str +CACHE_SIZE = int(os.getenv('CACHE_SIZE', 10000)) # max number of items in cache (default: 10k items) +CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60*60)) # how often to auto-trim the cache, in seconds (default: 1hr) + + # uncomment the lines below to ignore SSL certs #import ssl #ssl._create_default_https_context = ssl._create_unverified_context @@ -605,6 +611,17 @@ class CacheHandler(BaseHandler): class BaseCache: """ Subclasses must behave like a dict """ + def trim(self): + pass + + def autotrim(self, delay=CACHE_LIFESPAN): + # trim now, then re-arm a daemon timer so the cache is trimmed again every `delay` seconds + self.trim() + + t = threading.Timer(delay, self.autotrim, (delay,)) + t.daemon = True + t.start() + def __contains__(self, url): try: self[url] @@ -627,9 +644,15 @@ class SQLiteCache(BaseCache): self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)') self.con.execute('pragma journal_mode=WAL') + self.trim() + def __del__(self): self.con.close() + def trim(self): + with self.con: + self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? 
) foo )', (CACHE_SIZE,)) + def __getitem__(self, url): row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone() @@ -660,9 +683,15 @@ class MySQLCacheHandler(BaseCache): with self.cursor() as cursor: cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)') + self.trim() + def cursor(self): return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor() + def trim(self): + with self.cursor() as cursor: + cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,)) + def __getitem__(self, url): cursor = self.cursor() cursor.execute('SELECT * FROM data WHERE url=%s', (url,)) @@ -679,6 +708,19 @@ class MySQLCacheHandler(BaseCache): (url,) + value + value) +class CappedDict(OrderedDict, BaseCache): + def trim(self): + if CACHE_SIZE >= 0: + for _ in range(max(len(self) - CACHE_SIZE, 0)): + self.popitem(last=False) # drop entries from the front (least recently written) first + + def __setitem__(self, key, value): + # re-insert existing keys so a rewritten entry moves to the end, cf. https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes + if key in self: + del self[key] + OrderedDict.__setitem__(self, key, value) + + if 'CACHE' in os.environ: if os.environ['CACHE'] == 'mysql': default_cache = MySQLCacheHandler( @@ -689,10 +731,16 @@ if 'CACHE' in os.environ: ) elif os.environ['CACHE'] == 'sqlite': - default_cache = SQLiteCache(os.getenv('SQLITE_PATH', ':memory:')) + if 'SQLITE_PATH' in os.environ: + path = os.path.join(os.getenv('SQLITE_PATH'), 'morss-cache.db') + + else: + path = ':memory:' + + default_cache = SQLiteCache(path) else: - default_cache = {} + default_cache = CappedDict() if __name__ == '__main__': diff --git a/morss/wsgi.py b/morss/wsgi.py index 0a98c39..2a3d422 100644 --- a/morss/wsgi.py +++ b/morss/wsgi.py @@ -284,6 +284,12 @@ def cgi_handle_request(): def 
cgi_start_server(): + crawler.default_cache.autotrim() + print('Serving http://localhost:%s/' % PORT) httpd = wsgiref.simple_server.make_server('', PORT, application) httpd.serve_forever() + + +if "gunicorn" in os.getenv('SERVER_SOFTWARE', ''): # '' default: os.getenv returns None when the variable is unset + crawler.default_cache.autotrim()