From bd1c6a5be6833d1ffb322605aaa13cb6ffc37702 Mon Sep 17 00:00:00 2001 From: pictuga Date: Wed, 8 Jan 2014 01:44:45 +0100 Subject: [PATCH] Huge commit: wsgiref-like. "Progress" dropped. Can now run as HTTP server. Main functions split again (Init, Fetch). LIM/MAX_STUFF moved to function body (this still has to be improved). "progress" too hard to get to work with wsgiref, dropped. This whole thing should make it easier to maintain code (no more ifs everywhere). --- morss.py | 204 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 141 insertions(+), 63 deletions(-) diff --git a/morss.py b/morss.py index 0289443..a3be6b8 100644 --- a/morss.py +++ b/morss.py @@ -26,6 +26,10 @@ import urllib2 import chardet import urlparse +import wsgiref.util +import wsgiref.simple_server +import wsgiref.handlers + from gzip import GzipFile from StringIO import StringIO @@ -54,7 +58,7 @@ FBAPPTOKEN = FBAPPID + '|' + FBSECRET PROTOCOL = ['http', 'https', 'ftp'] -if 'REQUEST_URI' in os.environ: +if 'SCRIPT_NAME' in os.environ: httplib.HTTPConnection.debuglevel = 1 import cgitb @@ -63,9 +67,9 @@ if 'REQUEST_URI' in os.environ: class MorssException(Exception): pass -def log(txt): - if DEBUG: - if HOLD: +def log(txt, force=False): + if DEBUG or force: + if 'REQUEST_URI' in os.environ: open('morss.log', 'a').write("%s\n" % repr(txt)) else: print repr(txt) @@ -84,16 +88,21 @@ def countWord(txt): return 0 class ParseOptions: - def __init__(self): + def __init__(self, environ=False): self.url = '' self.options = {} roptions = [] - if 'REQUEST_URI' in os.environ: - self.url = os.environ['REQUEST_URI'][1:] + if environ: + if 'REQUEST_URI' in environ: + self.url = environ['REQUEST_URI'][1:] + else: + self.url = environ['PATH_INFO'][1:] - if 'REDIRECT_URL' not in os.environ: - self.url = self.url[len(os.environ['SCRIPT_NAME']):] + if self.url.startswith('/morss.py'): + self.url = self.url[10:] + elif self.url.startswith('morss.py'): + self.url = self.url[9:] if self.url.startswith(':'): roptions = self.url.split('/')[0].split(':')[1:] @@ -475,7 +484,7 @@ def Fill(item, cache, feedurl='/', fast=False): return True -def Gather(url, cachePath, options): +def Init(url, cachePath, options): # url clean up log(url) @@ -492,6 +501,9 @@ def Gather(url, cachePath, options): cache = Cache(cachePath, url, options.proxy) log(cache._hash) + return (url, cache) + +def Fetch(url, cache, options): # do some useful facebook work feedify.PreWorker(url, cache) @@ -545,7 +557,7 @@ def Gather(url, cachePath, options): match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href") if len(match): link = urlparse.urljoin(url, match[0]) - return Gather(link, cachePath, options) + return Fetch(link, cachePath, options) else: log('no-link html') raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed') @@ -553,14 +565,35 @@ def Gather(url, cachePath, options): log('random page') raise MorssException('Link provided is not a valid feed') + + cache.save() + return rss + +def Gather(rss, url, cache, options): + log('YEAH') + size = len(rss.items) startTime = time.time() # custom settings + global LIM_ITEM + global LIM_TIME + global MAX_ITEM + global MAX_TIME + if options.progress: MAX_TIME = -1 + LIM_TIME = 15 + MAX_ITEM = -1 + LIM_ITEM = -1 if options.cache: MAX_TIME = 0 + if options.OFCOURSENOT: + log('welcome home') + LIM_ITEM = -1 + LIM_TIME = -1 + MAX_ITEM = -1 + MAX_TIME = -1 # set def runner(queue): @@ -601,14 +634,6 @@ def Gather(url, cachePath, options): if not options.keep: del item.desc - if options.progress: - end = size if MAX_ITEM == -1 else min(MAX_ITEM, size) - if options.json: - sys.stdout.write(json.dumps((i+1, end, item), default=lambda o: dict(o)) + "\n") - else: - sys.stdout.write("%s/%s\n" % (i+1, end)) - sys.stdout.flush() - queue = Queue.Queue() for i in range(THREADS): @@ -627,26 +652,93 @@ def Gather(url, cachePath, options): return rss -if __name__ == '__main__': +def cgi_app(environ, start_response): + options = ParseOptions(environ) + url = options.url + headers = {} + + global DEBUG + DEBUG = options.debug + + if 'HTTP_IF_NONE_MATCH' in environ: + if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY: + headers['status'] = '304 Not Modified' + start_response(headers['status'], headers.items()) + log(url) + log('etag good') + return [] + + headers['status'] = '200 OK' + headers['etag'] = '"%s"' % int(time.time()) + + if options.html: + headers['content-type'] = 'text/html' + elif options.debug or options.txt: + headers['content-type'] = 'text/plain' + elif options.json: + headers['content-type'] = 'application/json' + else: + headers['content-type'] = 'text/xml' + + url, cache = Init(url, os.getcwd() + '/cache', options) + RSS = Fetch(url, cache, options) + RSS = Gather(RSS, url, cache, options) + + if headers['content-type'] == 'text/xml': + headers['content-type'] = RSS.mimetype + + start_response(headers['status'], headers.items()) + + if not DEBUG and not options.silent: + if options.json: + if options.indent: + return json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x)) + else: + return json.dumps(RSS, sort_keys=True, default=lambda x: dict(x)) + else: + return RSS.tostring(xml_declaration=True, encoding='UTF-8') + + log('done') + +def cgi_wrapper(environ, start_response): + try: + return cgi_app(environ, start_response) + except (KeyboardInterrupt, SystemExit): + raise + except MorssException as e: + headers = {} + headers['status'] = '500 Oops' + headers['content-type'] = 'text/plain' + start_response(headers['status'], headers.items(), sys.exc_info()) + return 'Internal Error: %s' % e.message + except Exception as e: + headers = {} + headers['status'] = '500 Oops' + headers['content-type'] = 'text/plain' + start_response(headers['status'], headers.items(), sys.exc_info()) + return 'Unknown Error: %s' % e.message + +def cli_app(): options = ParseOptions() url = options.url - DEBUG = bool(options.debug) + global DEBUG + DEBUG = options.debug - if 'REQUEST_URI' in os.environ: - HOLD = True + url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options) + RSS = Fetch(url, cache, options) + RSS = Gather(RSS, url, cache, options) - if 'HTTP_IF_NONE_MATCH' in os.environ: - if not options.force and not options.facebook and time.time() - int(os.environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY: - print 'Status: 304' - print - log(url) - log('etag good') - sys.exit(0) + if not DEBUG and not options.silent: + if options.json: + if options.indent: + print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x)) + else: + print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x)) + else: + print RSS.tostring(xml_declaration=True, encoding='UTF-8') - cachePath = os.getcwd() + '/cache' - else: - cachePath = os.path.expanduser('~') + '/.cache/morss' + log('done') if options.facebook: facebook = Cache(cachePath, 'facebook', persistent=True, dic=True) @@ -685,37 +777,23 @@ if __name__ == '__main__': sys.exit(0) +def main(): if 'REQUEST_URI' in os.environ: - print 'Status: 200' - print 'ETag: "%s"' % int(time.time()) + wsgiref.handlers.CGIHandler().run(cgi_wrapper) - if options.html: - print 'Content-Type: text/html' - elif options.debug or options.txt: - print 'Content-Type: text/plain' - elif options.progress: - print 'Content-Type: application/octet-stream' - elif options.json: - print 'Content-Type: application/json' - else: - print 'Content-Type: text/xml' - print '' + elif len(sys.argv) <= 1: + httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper) + httpd.serve_forever() - HOLD = False + else: + try: + cli_app() + except (KeyboardInterrupt, SystemExit): + raise + except MorssException as e: + print 'Internal Error: %s' % e.message + except Exception as e: + print 'Unknown Error: %s' % e.message - - RSS = Gather(url, cachePath, options) - - if RSS is not False and not options.progress and not DEBUG and not options.silent: - if options.json: - if options.indent: - print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x)) - else: - print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x)) - else: - print RSS.tostring(xml_declaration=True, encoding='UTF-8') - - if RSS is False and 'progress' not in options: - print 'Error fetching feed.' - - log('done') +if __name__ == '__main__': + main()