Compare commits

..

No commits in common. "9ce6acba20fc97b1472a8df84abcbee375f26555" and "bbada0436a6230e577599b11b7099c3539af3f01" have entirely different histories.

5 changed files with 28 additions and 89 deletions

View File

@ -5,4 +5,4 @@ RUN apk add python3 py3-lxml py3-gunicorn py3-pip git
ADD . /app
RUN pip3 install /app
CMD gunicorn --bind 0.0.0.0:8080 -w 4 --preload morss
CMD gunicorn --bind 0.0.0.0:8080 -w 4 morss

View File

@ -186,7 +186,7 @@ uwsgi --http :8080 --plugin python --wsgi-file main.py
#### Using Gunicorn
```shell
gunicorn --preload morss
gunicorn morss
```
#### Using docker
@ -316,13 +316,6 @@ will be cleared every time the program is run). Path can be defined with
- `CACHE=mysql`: MySQL cache. Connection can be defined with the following
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
To limit the siz of the cache:
' `CACHE_SIZE` sets the target number of items in the cache (further items will
be deleted but the cache migth be temporarily bigger than that). Defaults to 10k
entries.
- `CACHE_LIFESPAN` sets how often the cache must be trimmed (i.e. cut down to
the number of items set in `CACHE_SIZE`). Defaults to 1hr.
## Configuration
### Length limitation

View File

@ -25,15 +25,36 @@ from . import cli
from .morss import MorssException
import wsgiref.simple_server
import wsgiref.handlers
PORT = int(os.getenv('PORT', 8080))
def main():
if 'REQUEST_URI' in os.environ:
# mod_cgi (w/o file handler)
wsgi.cgi_handle_request()
app = wsgi.cgi_app
app = wsgi.cgi_dispatcher(app)
app = wsgi.cgi_error_handler(app)
app = wsgi.cgi_encode(app)
wsgiref.handlers.CGIHandler().run(app)
elif len(sys.argv) <= 1:
# start internal (basic) http server (w/ file handler)
wsgi.cgi_start_server()
app = wsgi.cgi_app
app = wsgi.cgi_file_handler(app)
app = wsgi.cgi_dispatcher(app)
app = wsgi.cgi_error_handler(app)
app = wsgi.cgi_encode(app)
print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, app)
httpd.serve_forever()
else:
# as a CLI app

View File

@ -25,9 +25,7 @@ import chardet
from cgi import parse_header
import lxml.html
import time
import threading
import random
from collections import OrderedDict
try:
# python 2
@ -50,10 +48,6 @@ except NameError:
basestring = unicode = str
CACHE_SIZE = int(os.getenv('CACHE_SIZE', 10000)) # max number of items in cache (default: 10k items)
CACHE_LIFESPAN = int(os.getenv('CACHE_LIFESPAN', 60*60)) # how often to auto-clear the cache (default: 1hr)
# uncomment the lines below to ignore SSL certs
#import ssl
#ssl._create_default_https_context = ssl._create_unverified_context
@ -611,18 +605,6 @@ class CacheHandler(BaseHandler):
class BaseCache:
""" Subclasses must behave like a dict """
def trim(self):
pass
def autotrim(self, delay=CACHE_LIFESPAN):
# trim the cache every so often
self.trim()
t = threading.Timer(delay, self.autotrim)
t.daemon = True
t.start()
def __contains__(self, url):
try:
self[url]
@ -645,15 +627,9 @@ class SQLiteCache(BaseCache):
self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
self.con.execute('pragma journal_mode=WAL')
self.trim()
def __del__(self):
self.con.close()
def trim(self):
with self.con:
self.con.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET ? ) foo )', (CACHE_SIZE,))
def __getitem__(self, url):
row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()
@ -684,15 +660,9 @@ class MySQLCacheHandler(BaseCache):
with self.cursor() as cursor:
cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')
self.trim()
def cursor(self):
return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()
def trim(self):
with self.cursor() as cursor:
cursor.execute('DELETE FROM data WHERE timestamp <= ( SELECT timestamp FROM ( SELECT timestamp FROM data ORDER BY timestamp DESC LIMIT 1 OFFSET %s ) foo )', (CACHE_SIZE,))
def __getitem__(self, url):
cursor = self.cursor()
cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
@ -709,39 +679,20 @@ class MySQLCacheHandler(BaseCache):
(url,) + value + value)
class CappedDict(OrderedDict, BaseCache):
def trim(self):
if CACHE_SIZE >= 0:
for i in range( max( len(self) - CACHE_SIZE , 0 )):
self.popitem(False)
def __setitem__(self, key, value):
# https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
if key in self:
del self[key]
OrderedDict.__setitem__(self, key, value)
if 'CACHE' in os.environ:
if os.environ['CACHE'] == 'mysql':
default_cache = MySQLCacheHandler(
user = os.getenv('MYSQL_USER'),
password = os.getenv('MYSQL_PWD'),
database = os.getenv('MYSQL_DB'),
host = os.getenv('MYSQL_HOST', 'localhost')
host = os.getenv('MYSQL_HOST')
)
elif os.environ['CACHE'] == 'sqlite':
if 'SQLITE_PATH' in os.environ:
path = os.getenv('SQLITE_PATH') + '/morss-cache.db'
else:
path = ':memory:'
default_cache = SQLiteCache(path)
default_cache = SQLiteCache(os.getenv('SQLITE_PATH', ':memory:'))
else:
default_cache = CappedDict()
default_cache = {}
if __name__ == '__main__':

View File

@ -22,8 +22,6 @@ import lxml.etree
import cgitb
import wsgiref.util
import wsgiref.simple_server
import wsgiref.handlers
import mimetypes
try:
@ -39,9 +37,6 @@ from .morss import FeedFetch, FeedGather, FeedFormat
from .morss import Options, log, TIMEOUT, DELAY, MorssException
PORT = int(os.getenv('PORT', 8080))
def parse_options(options):
""" Turns ['md=True'] into {'md':True} """
out = {}
@ -272,24 +267,3 @@ application = cgi_file_handler(application)
application = cgi_dispatcher(application)
application = cgi_error_handler(application)
application = cgi_encode(application)
def cgi_handle_request():
app = cgi_app
app = cgi_dispatcher(app)
app = cgi_error_handler(app)
app = cgi_encode(app)
wsgiref.handlers.CGIHandler().run(app)
def cgi_start_server():
crawler.default_cache.autotrim()
print('Serving http://localhost:%s/' % PORT)
httpd = wsgiref.simple_server.make_server('', PORT, application)
httpd.serve_forever()
if 'gunicorn' in os.getenv('SERVER_SOFTWARE', ''):
crawler.default_cache.autotrim()