Compare commits

...

6 Commits

Author SHA1 Message Date
pictuga 9ce6acba20 Fix gunicorn related typo 2020-10-01 00:07:41 +02:00
pictuga 6192ff4081 gunicorn with --preload
To only load the code once (and start autotrim once)
2020-10-01 00:05:39 +02:00
pictuga 056a1b143f crawler: autotrim: make ctrl+c working 2020-10-01 00:04:36 +02:00
pictuga eed949736a crawler: add ability to limit cache size 2020-09-30 23:59:55 +02:00
pictuga 2fc7cd391c Shift __main__'s wsgi code where it belongs 2020-09-30 23:24:51 +02:00
pictuga d9f46b23a6 crawler: default value for MYSQL_HOST (localhost) 2020-09-30 13:17:02 +02:00
5 changed files with 89 additions and 28 deletions

View File

@ -5,4 +5,4 @@ RUN apk add python3 py3-lxml py3-gunicorn py3-pip git
ADD . /app ADD . /app
RUN pip3 install /app RUN pip3 install /app
CMD gunicorn --bind 0.0.0.0:8080 -w 4 morss CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload morss

View File

@ -186,7 +186,7 @@ uwsgi --http :8080 --plugin python --wsgi-file main.py
#### Using Gunicorn #### Using Gunicorn
```shell ```shell
gunicorn morss gunicorn --preload morss
``` ```
#### Using docker #### Using docker
@ -316,6 +316,13 @@ will be cleared every time the program is run). Path can be defined with
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following - `CACHE=mysql`: MySQL cache. Connection can be defined with the following
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST` environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
To limit the size of the cache:
- `CACHE_SIZE` sets the target number of items in the cache (further items will
be deleted but the cache might be temporarily bigger than that). Defaults to 10k
entries.
- `CACHE_LIFESPAN` sets how often the cache must be trimmed (i.e. cut down to
the number of items set in `CACHE_SIZE`). Defaults to 1hr.
## Configuration ## Configuration
### Length limitation ### Length limitation

View File

@ -25,36 +25,15 @@ from . import cli
from .morss import MorssException from .morss import MorssException
import wsgiref.simple_server
import wsgiref.handlers
PORT = int(os.getenv('PORT', 8080))
def main(): def main():
if 'REQUEST_URI' in os.environ: if 'REQUEST_URI' in os.environ:
# mod_cgi (w/o file handler) # mod_cgi (w/o file handler)
wsgi.cgi_handle_request()
app = wsgi.cgi_app
app = wsgi.cgi_dispatcher(app)
app = wsgi.cgi_error_handler(app)
app = wsgi.cgi_encode(app)
wsgiref.handlers.CGIHandler().run(app)
elif len(sys.argv) <= 1: elif len(sys.argv) <= 1:
# start internal (basic) http server (w/ file handler) # start internal (basic) http server (w/ file handler)
wsgi.cgi_start_server()
app = wsgi.cgi_app
app = wsgi.cgi_file_handler(app)
app = wsgi.cgi_dispatcher(app)
app = wsgi.cgi_error_handler(app)
app = wsgi.cgi_encode(app)
print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, app)
httpd.serve_forever()
else: else:
# as a CLI app # as a CLI app

View File

@ -25,7 +25,9 @@ import chardet
from cgi import parse_header from cgi import parse_header
import lxml.html import lxml.html
import time import time
import threading
import random import random
from collections import OrderedDict
try: try:
# python 2 # python 2
@ -48,6 +50,10 @@ except NameError:
basestring = unicode = str basestring = unicode = str
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 10000)) # max number of items in cache (default: 10k items)
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60*60)) # how often to auto-clear the cache (default: 1hr)
# uncomment the lines below to ignore SSL certs # uncomment the lines below to ignore SSL certs
#import ssl #import ssl
#ssl._create_default_https_context = ssl._create_unverified_context #ssl._create_default_https_context = ssl._create_unverified_context
@ -605,6 +611,18 @@ class CacheHandler(BaseHandler):
class BaseCache: class BaseCache:
""" Subclasses must behave like a dict """ """ Subclasses must behave like a dict """
def trim(self):
pass
def autotrim(self, delay=CACHE_LIFESPAN):
# trim the cache every so often
self.trim()
t = threading.Timer(delay, self.autotrim)
t.daemon = True
t.start()
def __contains__(self, url): def __contains__(self, url):
try: try:
self[url] self[url]
@ -627,9 +645,15 @@ class SQLiteCache(BaseCache):
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)') self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
self.con.execute('pragma journal_mode=WAL') self.con.execute('pragma journal_mode=WAL')
self.trim()
def __del__(self): def __del__(self):
self.con.close() self.con.close()
def trim(self):
with self.con:
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
def __getitem__(self, url): def __getitem__(self, url):
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone() row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
@ -660,9 +684,15 @@ class MySQLCacheHandler(BaseCache):
with self.cursor() as cursor: with self.cursor() as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)') cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
self.trim()
def cursor(self): def cursor(self):
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor() return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
def trim(self):
with self.cursor() as cursor:
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
def __getitem__(self, url): def __getitem__(self, url):
cursor = self.cursor() cursor = self.cursor()
cursor.execute('SELECT * FROM data WHERE url=%s', (url,)) cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
@ -679,20 +709,39 @@ class MySQLCacheHandler(BaseCache):
(url,) + value + value) (url,) + value + value)
class CappedDict(OrderedDict, BaseCache):
def trim(self):
if CACHE_SIZE >= 0:
for i in range( max( len(self) - CACHE_SIZE , 0 )):
self.popitem(False)
def __setitem__(self, key, value):
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
if key in self:
del self[key]
OrderedDict.__setitem__(self, key, value)
if 'CACHE' in os.environ: if 'CACHE' in os.environ:
if os.environ['CACHE'] == 'mysql': if os.environ['CACHE'] == 'mysql':
default_cache = MySQLCacheHandler( default_cache = MySQLCacheHandler(
user = os.getenv('MYSQL_USER'), user = os.getenv('MYSQL_USER'),
password = os.getenv('MYSQL_PWD'), password = os.getenv('MYSQL_PWD'),
database = os.getenv('MYSQL_DB'), database = os.getenv('MYSQL_DB'),
host = os.getenv('MYSQL_HOST') host = os.getenv('MYSQL_HOST', 'localhost')
) )
elif os.environ['CACHE'] == 'sqlite': elif os.environ['CACHE'] == 'sqlite':
default_cache = SQLiteCache(os.getenv('SQLITE_PATH', ':memory:')) if 'SQLITE_PATH' in os.environ:
path = os.getenv('SQLITE_PATH') + '/morss-cache.db'
else: else:
default_cache = {} path = ':memory:'
default_cache = SQLiteCache(path)
else:
default_cache = CappedDict()
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -22,6 +22,8 @@ import lxml.etree
import cgitb import cgitb
import wsgiref.util import wsgiref.util
import wsgiref.simple_server
import wsgiref.handlers
import mimetypes import mimetypes
try: try:
@ -37,6 +39,9 @@ from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options, log, TIMEOUT, DELAY, MorssException from .morss import Options, log, TIMEOUT, DELAY, MorssException
PORT = int(os.getenv('PORT', 8080))
def parse_options(options): def parse_options(options):
""" Turns ['md=True'] into {'md':True} """ """ Turns ['md=True'] into {'md':True} """
out = {} out = {}
@ -267,3 +272,24 @@ application = cgi_file_handler(application)
application = cgi_dispatcher(application) application = cgi_dispatcher(application)
application = cgi_error_handler(application) application = cgi_error_handler(application)
application = cgi_encode(application) application = cgi_encode(application)
def cgi_handle_request():
app = cgi_app
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
wsgiref.handlers.CGIHandler().run(app)
def cgi_start_server():
crawler.default_cache.autotrim()
print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, application)
httpd.serve_forever()
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
crawler.default_cache.autotrim()