Compare commits

..

No commits in common. "cb21871c3529715e8a91377e21977864d68e6cb3" and "1ff7e4103cff72209e390203463f21bb04ac267f" have entirely different histories.

5 changed files with 100 additions and 164 deletions

View File

@ -40,7 +40,7 @@ Some features of morss:
- Follow 301/meta redirects - Follow 301/meta redirects
- Recover xml feeds with corrupt encoding - Recover xml feeds with corrupt encoding
- Supports gzip-compressed http content - Supports gzip-compressed http content
- HTTP caching with different backends (in-memory/sqlite/mysql/redis/diskcache) - HTTP caching with 3 different backends (in-memory/sqlite/mysql)
- Works as server/cli tool - Works as server/cli tool
- Deobfuscate various tracking links - Deobfuscate various tracking links
@ -60,8 +60,8 @@ Full installation (including optional dependencies)
pip install git+https://git.pictuga.com/pictuga/morss.git#[full] pip install git+https://git.pictuga.com/pictuga/morss.git#[full]
``` ```
The full install includes mysql, redis and diskcache (possible cache backends). The full install includes mysql and redis (possible cache backends). Otherwise,
Otherwise, only in-memory and sqlite3 caches are available. only in-memory and sqlite3 caches are available.
The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as The dependency `lxml` is fairly long to install (especially on Raspberry Pi, as
C code needs to be compiled). If possible on your distribution, try installing C code needs to be compiled). If possible on your distribution, try installing
@ -390,14 +390,12 @@ will be cleared every time the program is run). Path can be defined with
environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST` environment variables: `MYSQL_USER`, `MYSQL_PWD`, `MYSQL_DB`, `MYSQL_HOST`
- `CACHE=redis`: Redis cache. Connection can be defined with the following - `CACHE=redis`: Redis cache. Connection can be defined with the following
environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD` environment variables: `REDIS_HOST`, `REDIS_PORT`, `REDIS_DB`, `REDIS_PWD`
- `CACHE=diskcache`: disk-based cache. Target directory can be defined with
`DISKCAHE_DIR`.
To limit the size of the cache: To limit the size of the cache:
- `CACHE_SIZE` sets the target number of items in the cache (further items will - `CACHE_SIZE` sets the target number of items in the cache (further items will
be deleted but the cache might be temporarily bigger than that). Defaults to 1k be deleted but the cache might be temporarily bigger than that). Defaults to 1k
entries. NB. When using `diskcache`, this is the cache max size in Bytes. entries.
- `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut - `CACHE_LIFESPAN` (seconds) sets how often the cache must be trimmed (i.e. cut
down to the number of items set in `CACHE_SIZE`). Defaults to 1min. down to the number of items set in `CACHE_SIZE`). Defaults to 1min.

View File

@ -58,8 +58,8 @@ except ImportError:
class SQLiteCache(BaseCache): class SQLiteCache(BaseCache):
def __init__(self, path=':memory:'): def __init__(self, filename=':memory:'):
self.con = sqlite3.connect(path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False) self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
with self.con: with self.con:
self.con.execute('CREATE TABLE IF NOT EXISTS data (ky UNICODE PRIMARY KEY, data BLOB, timestamp INT)') self.con.execute('CREATE TABLE IF NOT EXISTS data (ky UNICODE PRIMARY KEY, data BLOB, timestamp INT)')
@ -158,29 +158,6 @@ class RedisCacheHandler(BaseCache):
self.r.set(key, data) self.r.set(key, data)
try:
import diskcache # isort:skip
except ImportError:
pass
class DiskCacheHandler(BaseCache):
def __init__(self, directory=None, **kwargs):
self.cache = diskcache.Cache(directory=directory, eviction_policy='least-frequently-used', **kwargs)
def __del__(self):
self.cache.close()
def trim(self):
self.cache.cull()
def __getitem__(self, key):
return self.cache[key]
def __setitem__(self, key, data):
self.cache.set(key, data)
if 'CACHE' in os.environ: if 'CACHE' in os.environ:
if os.environ['CACHE'] == 'mysql': if os.environ['CACHE'] == 'mysql':
default_cache = MySQLCacheHandler( default_cache = MySQLCacheHandler(
@ -191,9 +168,13 @@ if 'CACHE' in os.environ:
) )
elif os.environ['CACHE'] == 'sqlite': elif os.environ['CACHE'] == 'sqlite':
default_cache = SQLiteCache( if 'SQLITE_PATH' in os.environ:
os.getenv('SQLITE_PATH', ':memory:') path = os.getenv('SQLITE_PATH')
)
else:
path = ':memory:'
default_cache = SQLiteCache(path)
elif os.environ['CACHE'] == 'redis': elif os.environ['CACHE'] == 'redis':
default_cache = RedisCacheHandler( default_cache = RedisCacheHandler(
@ -203,11 +184,5 @@ if 'CACHE' in os.environ:
password = os.getenv('REDIS_PWD', None) password = os.getenv('REDIS_PWD', None)
) )
elif os.environ['CACHE'] == 'diskcache':
default_cache = DiskCacheHandler(
directory = os.getenv('DISKCAHE_DIR', '/tmp/morss-diskcache'),
size_limit = CACHE_SIZE # in Bytes
)
else: else:
default_cache = CappedDict() default_cache = CappedDict()

View File

@ -19,6 +19,7 @@ import os
import pickle import pickle
import random import random
import re import re
import sys
import time import time
import zlib import zlib
from cgi import parse_header from cgi import parse_header
@ -33,14 +34,14 @@ try:
# python 2 # python 2
from urllib import quote from urllib import quote
from mimetools import Message as message_from_string import mimetools
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler, from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
Request, addinfourl, build_opener, parse_http_list, Request, addinfourl, build_opener, parse_http_list,
parse_keqv_list) parse_keqv_list)
from urlparse import urlparse, urlunparse from urlparse import urlparse, urlunparse
except ImportError: except ImportError:
# python 3 # python 3
from email import message_from_string import email
from urllib.parse import quote, urlparse, urlunparse from urllib.parse import quote, urlparse, urlunparse
from urllib.request import (BaseHandler, HTTPCookieProcessor, from urllib.request import (BaseHandler, HTTPCookieProcessor,
HTTPRedirectHandler, Request, addinfourl, HTTPRedirectHandler, Request, addinfourl,
@ -108,7 +109,7 @@ def adv_get(url, post=None, timeout=None, *args, **kwargs):
} }
def custom_opener(follow=None, policy=None, force_min=None, force_max=None): def custom_opener(follow=None, delay=None):
handlers = [] handlers = []
# as per urllib2 source code, these Handlers are added first # as per urllib2 source code, these Handlers are added first
@ -142,7 +143,7 @@ def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
if follow: if follow:
handlers.append(AlternateHandler(MIMETYPE[follow])) handlers.append(AlternateHandler(MIMETYPE[follow]))
handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max)) handlers.append(CacheHandler(force_min=delay))
return build_opener(*handlers) return build_opener(*handlers)
@ -426,50 +427,31 @@ class HTTPRefreshHandler(BaseHandler):
https_response = http_response https_response = http_response
def error_response(code, msg, url=''):
# return an error as a response
resp = addinfourl(BytesIO(), message_from_string('\n\n'), url, code)
resp.msg = msg
return resp
class CacheHandler(BaseHandler): class CacheHandler(BaseHandler):
" Cache based on etags/last-modified " " Cache based on etags/last-modified "
privacy = 'private' # Websites can indicate whether the page should be cached private_cache = False # Websites can indicate whether the page should be
# by CDNs (e.g. shouldn't be the case for # cached by CDNs (e.g. shouldn't be the case for
# private/confidential/user-specific pages. With this # private/confidential/user-specific pages.
# setting, decide whether you want the cache to behave # With this setting, decide whether (False) you want
# like a CDN (i.e. don't cache private pages, 'public'), # private pages), or (True) to behave like an end-cache
# or to behave like an end-user private pages # private pages. If unsure, False is the safest bet.
# ('private'). If unsure, 'public' is the safest bet, # private pages. If unsure, False is the safest bet.
# but many websites abuse this feature...
# NB. This overrides all the other min/max/policy settings.
handler_order = 499 handler_order = 499
def __init__(self, cache=None, force_min=None, force_max=None, policy=None): def __init__(self, cache=None, force_min=None):
self.cache = cache or default_cache self.cache = cache or default_cache
self.force_min = force_min self.force_min = force_min
self.force_max = force_max # Servers indicate how long they think their content is "valid".
self.policy = policy # can be cached/refresh/offline/None (default) # With this parameter (force_min, expressed in seconds), we can
# override the validity period (i.e. bypassing http headers)
# Servers indicate how long they think their content is "valid". With # Special values:
# this parameter (force_min/max, expressed in seconds), we can override # -1: valid forever, i.e. use the cache no matter what (and fetch
# the validity period (i.e. bypassing http headers) # the page online if not present in cache)
# Special choices, via "policy": # 0: valid zero second, i.e. force refresh
# cached: use the cache no matter what (and fetch the page online if # -2: same as -1, i.e. use the cache no matter what, but do NOT
# not present in cache) # fetch the page online if not present in cache, throw an
# refresh: valid zero second, i.e. force refresh # error instead
# offline: same as cached, i.e. use the cache no matter what, but do
# NOT fetch the page online if not present in cache, throw an
# error instead
# None: just follow protocols
# sanity checks
assert self.force_max is None or self.force_max >= 0
assert self.force_min is None or self.force_min >= 0
assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min
def load(self, url): def load(self, url):
try: try:
@ -479,7 +461,10 @@ class CacheHandler(BaseHandler):
data = None data = None
else: else:
data['headers'] = message_from_string(data['headers'] or unicode()) # headers if sys.version_info[0] >= 3:
data['headers'] = email.message_from_string(data['headers'] or unicode()) # headers
else:
data['headers'] = mimetools.Message(StringIO(data['headers'] or unicode()))
return data return data
@ -487,17 +472,18 @@ class CacheHandler(BaseHandler):
data['headers'] = unicode(data['headers']) data['headers'] = unicode(data['headers'])
self.cache[key] = pickle.dumps(data, 0) self.cache[key] = pickle.dumps(data, 0)
def cached_response(self, req, fallback=None): def is_cached(self, key):
return self.load(key) is not None
def cached_response(self, req):
# this does NOT check whether it's already cached, use with care
data = self.load(req.get_full_url()) data = self.load(req.get_full_url())
if data is not None: # return the cache as a response
# return the cache as a response resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code']) resp.msg = data['msg']
resp.msg = data['msg']
return resp
else: return resp
return fallback
def save_response(self, req, resp): def save_response(self, req, resp):
data = resp.read() data = resp.read()
@ -505,7 +491,7 @@ class CacheHandler(BaseHandler):
self.save(req.get_full_url(), { self.save(req.get_full_url(), {
'code': resp.code, 'code': resp.code,
'msg': resp.msg, 'msg': resp.msg,
'headers': str(resp.headers), 'headers': resp.headers,
'data': data, 'data': data,
'timestamp': time.time() 'timestamp': time.time()
}) })
@ -534,74 +520,60 @@ class CacheHandler(BaseHandler):
# If 'None' is returned, try your chance with the next-available handler # If 'None' is returned, try your chance with the next-available handler
# If a 'resp' is returned, stop there, and proceed with 'http_response' # If a 'resp' is returned, stop there, and proceed with 'http_response'
# Here, we try to see whether we want to use data from cache (i.e.
# return 'resp'), or whether we want to refresh the content (return
# 'None')
data = self.load(req.get_full_url()) data = self.load(req.get_full_url())
if data is not None: if data is None:
# some info needed to process everything # cache empty, refresh
cache_control = parse_http_list(data['headers'].get('cache-control', ()))
cache_control += parse_http_list(data['headers'].get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x]
cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
cache_age = time.time() - data['timestamp']
# list in a simple way what to do in special cases
if data is not None and 'private' in cc_list and self.privacy == 'public':
# private data but public cache, do not use cache
# privacy concern, so handled first and foremost
# (and doesn't need to be addressed anymore afterwards)
return None return None
elif self.policy == 'offline': # some info needed to process everything
# use cache, or return an error cache_control = parse_http_list(data['headers'].get('cache-control', ()))
return self.cached_response( cache_control += parse_http_list(data['headers'].get('pragma', ()))
req,
error_response(409, 'Conflict', req.get_full_url())
)
elif self.policy == 'cached': cc_list = [x for x in cache_control if '=' not in x]
# use cache, or fetch online cc_values = parse_keqv_list([x for x in cache_control if '=' in x])
return self.cached_response(req, None)
elif self.policy == 'refresh': cache_age = time.time() - data['timestamp']
# list in a simple way what to do when
if self.force_min == -2:
if data['code'] is not None:
# already in cache, perfect, use cache
return self.cached_response(req)
else:
# raise an error, via urllib handlers
resp = addinfourl(BytesIO(), data['headers'], req.get_full_url(), 409)
resp.msg = 'Conflict'
return resp
elif self.force_min == -1:
# force use cache
return self.cached_response(req)
elif self.force_min == 0:
# force refresh # force refresh
return None return None
elif data is None:
# we have already settled all the cases that don't need the cache.
# all the following ones need the cached item
return None
elif self.force_max is not None and cache_age > self.force_max:
# older than we want, refresh
return None
elif self.force_min is not None and cache_age < self.force_min:
# recent enough, use cache
return self.cached_response(req)
elif data['code'] == 301 and cache_age < 7*24*3600: elif data['code'] == 301 and cache_age < 7*24*3600:
# "301 Moved Permanently" has to be cached...as long as we want # "301 Moved Permanently" has to be cached...as long as we want
# (awesome HTTP specs), let's say a week (why not?). Use force_min=0 # (awesome HTTP specs), let's say a week (why not?). Use force_min=0
# if you want to bypass this (needed for a proper refresh) # if you want to bypass this (needed for a proper refresh)
return self.cached_response(req) return self.cached_response(req)
elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list): elif (self.force_min is None or self.force_min > 0) and ('no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache)):
# kindly follow web servers indications, refresh if the same # kindly follow web servers indications, refresh
# settings are used all along, this section shouldn't be of any use, # if the same settings are used all along, this section shouldn't be
# since the page wouldn't be cached in the first place the check is # of any use, since the page wouldn't be cached in the first place
# only performed "just in case" # the check is only performed "just in case"
# NB. NOT respected if force_min is set
return None return None
elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age: elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
# server says it's still fine (and we trust him, if not, use overrides), use cache # server says it's still fine (and we trust him, if not, use force_min=0), use cache
return self.cached_response(req)
elif self.force_min is not None and self.force_min > cache_age:
# still recent enough for us, use cache
return self.cached_response(req) return self.cached_response(req)
else: else:
@ -612,19 +584,19 @@ class CacheHandler(BaseHandler):
# code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will) # code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)
# NB. It might re-save requests pulled from cache, which will re-set the time() to the latest, i.e. lengthen its useful life # NB. It might re-save requests pulled from cache, which will re-set the time() to the latest, i.e. lengthen its useful life
if resp.code == 304 and resp.url in self.cache: if resp.code == 304 and self.is_cached(resp.url):
# we are hopefully the first after the HTTP handler, so no need # we are hopefully the first after the HTTP handler, so no need
# to re-run all the *_response # to re-run all the *_response
# here: cached page, returning from cache # here: cached page, returning from cache
return self.cached_response(req) return self.cached_response(req)
elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers): elif ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
cache_control = parse_http_list(resp.headers.get('cache-control', ())) cache_control = parse_http_list(resp.headers.get('cache-control', ()))
cache_control += parse_http_list(resp.headers.get('pragma', ())) cache_control += parse_http_list(resp.headers.get('pragma', ()))
cc_list = [x for x in cache_control if '=' not in x] cc_list = [x for x in cache_control if '=' not in x]
if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'): if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
# kindly follow web servers indications (do not save & return) # kindly follow web servers indications (do not save & return)
return resp return resp
@ -646,8 +618,6 @@ if 'IGNORE_SSL' in os.environ:
if __name__ == '__main__': if __name__ == '__main__':
import sys
req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it') req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
if sys.flags.interactive: if sys.flags.interactive:

View File

@ -194,20 +194,21 @@ def ItemFill(item, options, feedurl='/', fast=False):
log(item.link) log(item.link)
# download # download
delay = -1
if fast or options.cache: if fast or options.fast:
# force cache, don't fetch # force cache, don't fetch
policy = 'offline' delay = -2
elif options.force: elif options.force:
# force refresh # force refresh
policy = 'refresh' delay = 0
else: else:
policy = None delay = 24*60*60 # 24h
try: try:
req = crawler.adv_get(url=item.link, policy=policy, force_min=24*60*60, timeout=TIMEOUT) req = crawler.adv_get(url=item.link, delay=delay, timeout=TIMEOUT)
except (IOError, HTTPException) as e: except (IOError, HTTPException) as e:
log('http error') log('http error')
@ -265,17 +266,11 @@ def FeedFetch(url, options):
# fetch feed # fetch feed
delay = DELAY delay = DELAY
if options.cache: if options.force:
policy = 'offline' delay = 0
elif options.force:
policy = 'refresh'
else:
policy = None
try: try:
req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), policy=policy, force_min=5*60, force_max=60*60, timeout=TIMEOUT * 2) req = crawler.adv_get(url=url, post=options.post, follow=('rss' if not options.items else None), delay=delay, timeout=TIMEOUT * 2)
except (IOError, HTTPException): except (IOError, HTTPException):
raise MorssException('Error downloading feed') raise MorssException('Error downloading feed')
@ -329,7 +324,7 @@ def FeedGather(rss, url, options):
max_time = 0 max_time = 0
if options.newest: if options.newest:
# :newest take the newest items (instead of appearing order) # :newest take the newest items
now = datetime.now(tz.tzutc()) now = datetime.now(tz.tzutc())
sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True) sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True)
@ -338,7 +333,6 @@ def FeedGather(rss, url, options):
sorted_items = list(rss.items) sorted_items = list(rss.items)
for i, item in enumerate(sorted_items): for i, item in enumerate(sorted_items):
# hard cap
if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0: if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0:
log('dropped') log('dropped')
item.remove() item.remove()
@ -351,7 +345,6 @@ def FeedGather(rss, url, options):
item = ItemFix(item, options, url) item = ItemFix(item, options, url)
# soft cap
if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0: if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0:
if not options.proxy: if not options.proxy:
if ItemFill(item, options, url, True) is False: if ItemFill(item, options, url, True) is False:

View File

@ -14,7 +14,7 @@ setup(
license = 'AGPL v3', license = 'AGPL v3',
packages = [package_name], packages = [package_name],
install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'], install_requires = ['lxml', 'bs4', 'python-dateutil', 'chardet'],
extras_require = {'full': ['pymysql', 'redis', 'diskcache']}, extras_require = {'full': ['pymysql', 'redis']},
package_data = {package_name: ['feedify.ini']}, package_data = {package_name: ['feedify.ini']},
data_files = [ data_files = [
('share/' + package_name, ['README.md', 'LICENSE']), ('share/' + package_name, ['README.md', 'LICENSE']),