Huge commit: wsgiref-like. "Progress" dropped.

Can now run as an HTTP server. The main functions were split again (Init, Fetch). LIM/MAX_STUFF moved into the function body (this still has to be improved). "progress" was too hard to get working with wsgiref, so it was dropped. All of this should make the code easier to maintain (no more ifs everywhere).
master
pictuga 2014-01-08 01:44:45 +01:00
parent f12031cbdb
commit bd1c6a5be6
1 changed file with 141 additions and 63 deletions

morss.py
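For orientation, the new processing chain this diff introduces can be driven roughly as follows. This is a sketch, not code from the commit; it mirrors cli_app() below, using the same cache path cli_app() uses:

# Sketch only: the new Init/Fetch/Gather split, wired up the way cli_app() does it.
options = ParseOptions()    # CLI mode: URL and :options come from the command line
url, cache = Init(options.url, os.path.expanduser('~/.cache/morss'), options)
rss = Fetch(url, cache, options)        # download the feed, follow alternate links
rss = Gather(rss, url, cache, options)  # fill the items, applying the LIM_*/MAX_* limits
print rss.tostring(xml_declaration=True, encoding='UTF-8')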

@@ -26,6 +26,10 @@ import urllib2
 import chardet
 import urlparse
 
+import wsgiref.util
+import wsgiref.simple_server
+import wsgiref.handlers
+
 from gzip import GzipFile
 from StringIO import StringIO
@@ -54,7 +58,7 @@ FBAPPTOKEN = FBAPPID + '|' + FBSECRET
 PROTOCOL = ['http', 'https', 'ftp']
 
-if 'REQUEST_URI' in os.environ:
+if 'SCRIPT_NAME' in os.environ:
 	httplib.HTTPConnection.debuglevel = 1
 
 	import cgitb
@@ -63,9 +67,9 @@ if 'REQUEST_URI' in os.environ:
 class MorssException(Exception):
 	pass
 
-def log(txt):
-	if DEBUG:
-		if HOLD:
+def log(txt, force=False):
+	if DEBUG or force:
+		if 'REQUEST_URI' in os.environ:
 			open('morss.log', 'a').write("%s\n" % repr(txt))
 		else:
 			print repr(txt)
@@ -84,16 +88,21 @@ def countWord(txt):
 	return 0
 
 class ParseOptions:
-	def __init__(self):
+	def __init__(self, environ=False):
 		self.url = ''
 		self.options = {}
 		roptions = []
 
-		if 'REQUEST_URI' in os.environ:
-			self.url = os.environ['REQUEST_URI'][1:]
+		if environ:
+			if 'REQUEST_URI' in environ:
+				self.url = environ['REQUEST_URI'][1:]
+			else:
+				self.url = environ['PATH_INFO'][1:]
 
-			if 'REDIRECT_URL' not in os.environ:
-				self.url = self.url[len(os.environ['SCRIPT_NAME']):]
+			if self.url.startswith('/morss.py'):
+				self.url = self.url[10:]
+			elif self.url.startswith('morss.py'):
+				self.url = self.url[9:]
 
 		if self.url.startswith(':'):
 			roptions = self.url.split('/')[0].split(':')[1:]
@@ -475,7 +484,7 @@ def Fill(item, cache, feedurl='/', fast=False):
 
 	return True
 
-def Gather(url, cachePath, options):
+def Init(url, cachePath, options):
 	# url clean up
 	log(url)
@@ -492,6 +501,9 @@ def Gather(url, cachePath, options):
 	cache = Cache(cachePath, url, options.proxy)
 	log(cache._hash)
 
+	return (url, cache)
+
+def Fetch(url, cache, options):
 	# do some useful facebook work
 	feedify.PreWorker(url, cache)
@@ -545,7 +557,7 @@ def Gather(url, cachePath, options):
 			match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
 			if len(match):
 				link = urlparse.urljoin(url, match[0])
-				return Gather(link, cachePath, options)
+				return Fetch(link, cachePath, options)
 			else:
 				log('no-link html')
 				raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed')
@@ -553,14 +565,35 @@ def Gather(url, cachePath, options):
 			log('random page')
 			raise MorssException('Link provided is not a valid feed')
 
+	cache.save()
+	return rss
+
+def Gather(rss, url, cache, options):
+	log('YEAH')
+
 	size = len(rss.items)
 	startTime = time.time()
 
 	# custom settings
+	global LIM_ITEM
+	global LIM_TIME
+	global MAX_ITEM
+	global MAX_TIME
+
 	if options.progress:
 		MAX_TIME = -1
+		LIM_TIME = 15
+		MAX_ITEM = -1
+		LIM_ITEM = -1
 	if options.cache:
 		MAX_TIME = 0
+
+	if options.OFCOURSENOT:
+		log('welcome home')
+		LIM_ITEM = -1
+		LIM_TIME = -1
+		MAX_ITEM = -1
+		MAX_TIME = -1
 
 	# set
 	def runner(queue):
@@ -601,14 +634,6 @@ def Gather(url, cachePath, options):
 		if not options.keep:
 			del item.desc
 
-		if options.progress:
-			end = size if MAX_ITEM == -1 else min(MAX_ITEM, size)
-			if options.json:
-				sys.stdout.write(json.dumps((i+1, end, item), default=lambda o: dict(o)) + "\n")
-			else:
-				sys.stdout.write("%s/%s\n" % (i+1, end))
-			sys.stdout.flush()
-
 	queue = Queue.Queue()
 
 	for i in range(THREADS):
@@ -627,26 +652,93 @@ def Gather(url, cachePath, options):
 
 	return rss
 
-if __name__ == '__main__':
+def cgi_app(environ, start_response):
+	options = ParseOptions(environ)
+	url = options.url
+	headers = {}
+
+	global DEBUG
+	DEBUG = options.debug
+
+	if 'HTTP_IF_NONE_MATCH' in environ:
+		if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
+			headers['status'] = '304 Not Modified'
+			start_response(headers['status'], headers.items())
+			log(url)
+			log('etag good')
+			return []
+
+	headers['status'] = '200 OK'
+	headers['etag'] = '"%s"' % int(time.time())
+
+	if options.html:
+		headers['content-type'] = 'text/html'
+	elif options.debug or options.txt:
+		headers['content-type'] = 'text/plain'
+	elif options.json:
+		headers['content-type'] = 'application/json'
+	else:
+		headers['content-type'] = 'text/xml'
+
+	url, cache = Init(url, os.getcwd() + '/cache', options)
+	RSS = Fetch(url, cache, options)
+	RSS = Gather(RSS, url, cache, options)
+
+	if headers['content-type'] == 'text/xml':
+		headers['content-type'] = RSS.mimetype
+
+	start_response(headers['status'], headers.items())
+
+	if not DEBUG and not options.silent:
+		if options.json:
+			if options.indent:
+				return json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
+			else:
+				return json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
+		else:
+			return RSS.tostring(xml_declaration=True, encoding='UTF-8')
+
+	log('done')
+
+def cgi_wrapper(environ, start_response):
+	try:
+		return cgi_app(environ, start_response)
+	except (KeyboardInterrupt, SystemExit):
+		raise
+	except MorssException as e:
+		headers = {}
+		headers['status'] = '500 Oops'
+		headers['content-type'] = 'text/plain'
+		start_response(headers['status'], headers.items(), sys.exc_info())
+		return 'Internal Error: %s' % e.message
+	except Exception as e:
+		headers = {}
+		headers['status'] = '500 Oops'
+		headers['content-type'] = 'text/plain'
+		start_response(headers['status'], headers.items(), sys.exc_info())
+		return 'Unknown Error: %s' % e.message
+
+def cli_app():
 	options = ParseOptions()
 	url = options.url
-	DEBUG = bool(options.debug)
 
-	if 'REQUEST_URI' in os.environ:
-		HOLD = True
+	global DEBUG
+	DEBUG = options.debug
 
-		if 'HTTP_IF_NONE_MATCH' in os.environ:
-			if not options.force and not options.facebook and time.time() - int(os.environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
-				print 'Status: 304'
-				print
-				log(url)
-				log('etag good')
-				sys.exit(0)
+	url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
+	RSS = Fetch(url, cache, options)
+	RSS = Gather(RSS, url, cache, options)
 
-		cachePath = os.getcwd() + '/cache'
-	else:
-		cachePath = os.path.expanduser('~') + '/.cache/morss'
+	if not DEBUG and not options.silent:
+		if options.json:
+			if options.indent:
+				print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
+			else:
+				print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
+		else:
+			print RSS.tostring(xml_declaration=True, encoding='UTF-8')
+
+	log('done')
 
 	if options.facebook:
 		facebook = Cache(cachePath, 'facebook', persistent=True, dic=True)
@@ -685,37 +777,23 @@ if __name__ == '__main__':
 
 		sys.exit(0)
 
+def main():
 	if 'REQUEST_URI' in os.environ:
-		print 'Status: 200'
-		print 'ETag: "%s"' % int(time.time())
+		wsgiref.handlers.CGIHandler().run(cgi_wrapper)
 
-	if options.html:
-		print 'Content-Type: text/html'
-	elif options.debug or options.txt:
-		print 'Content-Type: text/plain'
-	elif options.progress:
-		print 'Content-Type: application/octet-stream'
-	elif options.json:
-		print 'Content-Type: application/json'
-	else:
-		print 'Content-Type: text/xml'
-	print ''
+	elif len(sys.argv) <= 1:
+		httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper)
+		httpd.serve_forever()
 
-	HOLD = False
-
-	RSS = Gather(url, cachePath, options)
-
-	if RSS is not False and not options.progress and not DEBUG and not options.silent:
-		if options.json:
-			if options.indent:
-				print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
-			else:
-				print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
-		else:
-			print RSS.tostring(xml_declaration=True, encoding='UTF-8')
-
-	if RSS is False and 'progress' not in options:
-		print 'Error fetching feed.'
-
-	log('done')
+	else:
+		try:
+			cli_app()
+		except (KeyboardInterrupt, SystemExit):
+			raise
+		except MorssException as e:
+			print 'Internal Error: %s' % e.message
+		except Exception as e:
+			print 'Unknown Error: %s' % e.message
+
+if __name__ == '__main__':
+	main()
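As a usage note: per main() above, running the script with no arguments now serves cgi_wrapper on port 8080 via wsgiref.simple_server, and the WSGI path it expects is the one ParseOptions handles, i.e. colon-prefixed options before the feed URL. A small sketch with illustrative values only:

# Illustrative only: how ParseOptions slices a PATH_INFO such as
#   /:json:indent/http://example.com/feed
url = ':json:indent/http://example.com/feed'  # after the leading '/' is stripped
roptions = url.split('/')[0].split(':')[1:]   # -> ['json', 'indent']
# the rest of the path ('http://example.com/feed') is the feed to fetch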