Huge commit: wsgiref-like. "Progress" dropped.

morss can now run as a standalone HTTP server. The main function was split again (Init, Fetch, Gather). The LIM/MAX settings were moved into the function body (this still has to be improved). The "progress" feature was too hard to get working with wsgiref, so it was dropped. Altogether this should make the code easier to maintain (no more ifs scattered everywhere).
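For context, the shape this commit moves toward is a single WSGI callable that can be driven by CGI, by a standalone HTTP server, or skipped entirely for command-line use. A minimal sketch of that dispatch pattern (illustrative only, not the morss code itself; the app body is a placeholder):

    # Python 2 sketch of the wsgiref dispatch pattern adopted by this commit
    import os
    import sys
    import wsgiref.handlers
    import wsgiref.simple_server

    def app(environ, start_response):
        # one WSGI callable, shared by every entry point
        start_response('200 OK', [('content-type', 'text/plain')])
        return ['hello\n']

    if __name__ == '__main__':
        if 'REQUEST_URI' in os.environ:
            # invoked as a CGI script behind a web server
            wsgiref.handlers.CGIHandler().run(app)
        elif len(sys.argv) <= 1:
            # no arguments: run a standalone HTTP server
            wsgiref.simple_server.make_server('', 8080, app).serve_forever()
        else:
            # arguments given: command-line mode, no WSGI involved
            print 'CLI mode'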
master
pictuga 2014-01-08 01:44:45 +01:00
parent f12031cbdb
commit bd1c6a5be6
1 changed file with 141 additions and 63 deletions

morss.py

@@ -26,6 +26,10 @@ import urllib2
 import chardet
 import urlparse
+import wsgiref.util
+import wsgiref.simple_server
+import wsgiref.handlers
 from gzip import GzipFile
 from StringIO import StringIO
@@ -54,7 +58,7 @@ FBAPPTOKEN = FBAPPID + '|' + FBSECRET
 PROTOCOL = ['http', 'https', 'ftp']

-if 'REQUEST_URI' in os.environ:
+if 'SCRIPT_NAME' in os.environ:
 	httplib.HTTPConnection.debuglevel = 1

 	import cgitb
@@ -63,9 +67,9 @@ if 'REQUEST_URI' in os.environ:
 class MorssException(Exception):
 	pass

-def log(txt):
-	if DEBUG:
-		if HOLD:
+def log(txt, force=False):
+	if DEBUG or force:
+		if 'REQUEST_URI' in os.environ:
 			open('morss.log', 'a').write("%s\n" % repr(txt))
 		else:
 			print repr(txt)
@@ -84,16 +88,21 @@ def countWord(txt):
 	return 0

 class ParseOptions:
-	def __init__(self):
+	def __init__(self, environ=False):
 		self.url = ''
 		self.options = {}
 		roptions = []

-		if 'REQUEST_URI' in os.environ:
-			self.url = os.environ['REQUEST_URI'][1:]
+		if environ:
+			if 'REQUEST_URI' in environ:
+				self.url = environ['REQUEST_URI'][1:]
+			else:
+				self.url = environ['PATH_INFO'][1:]

-			if 'REDIRECT_URL' not in os.environ:
-				self.url = self.url[len(os.environ['SCRIPT_NAME']):]
+			if self.url.startswith('/morss.py'):
+				self.url = self.url[10:]
+			elif self.url.startswith('morss.py'):
+				self.url = self.url[9:]

 		if self.url.startswith(':'):
 			roptions = self.url.split('/')[0].split(':')[1:]
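
The reworked ParseOptions pulls the requested URL from the WSGI/CGI environment: REQUEST_URI when a classic CGI server provides it, PATH_INFO otherwise (which is what wsgiref's built-in server sets), then strips a leading "morss.py" script name. A worked illustration with a hypothetical request (the feed URL is an example):

    # GET /morss.py/:keep/http://example.com/feed
    environ = {
        'REQUEST_URI': '/morss.py/:keep/http://example.com/feed',  # classic CGI
        'PATH_INFO': '/:keep/http://example.com/feed',             # wsgiref server
    }
    # CGI case:     url = REQUEST_URI[1:] -> 'morss.py/:keep/http://example.com/feed'
    #               startswith('morss.py') -> url[9:] -> ':keep/http://example.com/feed'
    # wsgiref case: url = PATH_INFO[1:]   -> ':keep/http://example.com/feed'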
@@ -475,7 +484,7 @@ def Fill(item, cache, feedurl='/', fast=False):
 	return True

-def Gather(url, cachePath, options):
+def Init(url, cachePath, options):
 	# url clean up
 	log(url)
@@ -492,6 +501,9 @@ def Gather(url, cachePath, options):
 	cache = Cache(cachePath, url, options.proxy)
 	log(cache._hash)

+	return (url, cache)
+
+def Fetch(url, cache, options):
 	# do some useful facebook work
 	feedify.PreWorker(url, cache)
@@ -545,7 +557,7 @@ def Gather(url, cachePath, options):
 			match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
 			if len(match):
 				link = urlparse.urljoin(url, match[0])
-				return Gather(link, cachePath, options)
+				return Fetch(link, cachePath, options)
 			else:
 				log('no-link html')
 				raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed')
@@ -553,14 +565,35 @@ def Gather(url, cachePath, options):
 		log('random page')
 		raise MorssException('Link provided is not a valid feed')

 	cache.save()

+	return rss
+
+def Gather(rss, url, cache, options):
+	log('YEAH')
+	size = len(rss.items)
+	startTime = time.time()

 	# custom settings
+	global LIM_ITEM
+	global LIM_TIME
+	global MAX_ITEM
+	global MAX_TIME
+
 	if options.progress:
 		MAX_TIME = -1
 		LIM_TIME = 15
 		MAX_ITEM = -1
 		LIM_ITEM = -1

 	if options.cache:
 		MAX_TIME = 0

+	if options.OFCOURSENOT:
+		log('welcome home')
+		LIM_ITEM = -1
+		LIM_TIME = -1
+		MAX_ITEM = -1
+		MAX_TIME = -1

 	# set
 	def runner(queue):
@@ -601,14 +634,6 @@ def Gather(url, cachePath, options):
 		if not options.keep:
 			del item.desc

-	if options.progress:
-		end = size if MAX_ITEM == -1 else min(MAX_ITEM, size)
-		if options.json:
-			sys.stdout.write(json.dumps((i+1, end, item), default=lambda o: dict(o)) + "\n")
-		else:
-			sys.stdout.write("%s/%s\n" % (i+1, end))
-		sys.stdout.flush()
-
 	queue = Queue.Queue()

 	for i in range(THREADS):
@@ -627,26 +652,93 @@ def Gather(url, cachePath, options):
 	return rss

-if __name__ == '__main__':
+def cgi_app(environ, start_response):
+	options = ParseOptions(environ)
+	url = options.url
+	headers = {}
+
+	global DEBUG
+	DEBUG = options.debug
+
+	if 'HTTP_IF_NONE_MATCH' in environ:
+		if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
+			headers['status'] = '304 Not Modified'
+			start_response(headers['status'], headers.items())
+			log(url)
+			log('etag good')
+			return []
+
+	headers['status'] = '200 OK'
+	headers['etag'] = '"%s"' % int(time.time())
+
+	if options.html:
+		headers['content-type'] = 'text/html'
+	elif options.debug or options.txt:
+		headers['content-type'] = 'text/plain'
+	elif options.json:
+		headers['content-type'] = 'application/json'
+	else:
+		headers['content-type'] = 'text/xml'
+
+	url, cache = Init(url, os.getcwd() + '/cache', options)
+	RSS = Fetch(url, cache, options)
+	RSS = Gather(RSS, url, cache, options)
+
+	if headers['content-type'] == 'text/xml':
+		headers['content-type'] = RSS.mimetype
+
+	start_response(headers['status'], headers.items())
+
+	if not DEBUG and not options.silent:
+		if options.json:
+			if options.indent:
+				return json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
+			else:
+				return json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
+		else:
+			return RSS.tostring(xml_declaration=True, encoding='UTF-8')
+
+	log('done')
+
+def cgi_wrapper(environ, start_response):
+	try:
+		return cgi_app(environ, start_response)
+	except (KeyboardInterrupt, SystemExit):
+		raise
+	except MorssException as e:
+		headers = {}
+		headers['status'] = '500 Oops'
+		headers['content-type'] = 'text/plain'
+		start_response(headers['status'], headers.items(), sys.exc_info())
+		return 'Internal Error: %s' % e.message
+	except Exception as e:
+		headers = {}
+		headers['status'] = '500 Oops'
+		headers['content-type'] = 'text/plain'
+		start_response(headers['status'], headers.items(), sys.exc_info())
+		return 'Unknown Error: %s' % e.message
+
+def cli_app():
 	options = ParseOptions()
 	url = options.url

-	DEBUG = bool(options.debug)
+	global DEBUG
+	DEBUG = options.debug

-	if 'REQUEST_URI' in os.environ:
-		HOLD = True
+	url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
+	RSS = Fetch(url, cache, options)
+	RSS = Gather(RSS, url, cache, options)

-		if 'HTTP_IF_NONE_MATCH' in os.environ:
-			if not options.force and not options.facebook and time.time() - int(os.environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
-				print 'Status: 304'
-				print
-				log(url)
-				log('etag good')
-				sys.exit(0)
+	if not DEBUG and not options.silent:
+		if options.json:
+			if options.indent:
+				print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
+			else:
+				print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
+		else:
+			print RSS.tostring(xml_declaration=True, encoding='UTF-8')

-		cachePath = os.getcwd() + '/cache'
-	else:
-		cachePath = os.path.expanduser('~') + '/.cache/morss'
+	log('done')

 	if options.facebook:
 		facebook = Cache(cachePath, 'facebook', persistent=True, dic=True)
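
A note on cgi_wrapper above: the third argument it passes to start_response is the optional exc_info triple defined by the WSGI spec (PEP 333), which lets an application replace the status and headers after an error has occurred. A stripped-down sketch of the same idiom, with a placeholder app that always fails:

    import sys

    def failing_app(environ, start_response):
        raise ValueError('boom')  # placeholder, stands in for cgi_app

    def wrapper(environ, start_response):
        try:
            return failing_app(environ, start_response)
        except Exception as e:
            # exc_info lets WSGI swap in the error status/headers
            start_response('500 Oops', [('content-type', 'text/plain')],
                           sys.exc_info())
            return ['error: %s' % e]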
@@ -685,37 +777,23 @@ if __name__ == '__main__':
 				sys.exit(0)

+def main():
 	if 'REQUEST_URI' in os.environ:
-		print 'Status: 200'
-		print 'ETag: "%s"' % int(time.time())
+		wsgiref.handlers.CGIHandler().run(cgi_wrapper)

-		if options.html:
-			print 'Content-Type: text/html'
-		elif options.debug or options.txt:
-			print 'Content-Type: text/plain'
-		elif options.progress:
-			print 'Content-Type: application/octet-stream'
-		elif options.json:
-			print 'Content-Type: application/json'
-		else:
-			print 'Content-Type: text/xml'
-		print ''
+	elif len(sys.argv) <= 1:
+		httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper)
+		httpd.serve_forever()

-		HOLD = False
 	else:
+		try:
+			cli_app()
+		except (KeyboardInterrupt, SystemExit):
+			raise
+		except MorssException as e:
+			print 'Internal Error: %s' % e.message
+		except Exception as e:
+			print 'Unknown Error: %s' % e.message

-	RSS = Gather(url, cachePath, options)
-
-	if RSS is not False and not options.progress and not DEBUG and not options.silent:
-		if options.json:
-			if options.indent:
-				print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
-			else:
-				print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
-		else:
-			print RSS.tostring(xml_declaration=True, encoding='UTF-8')
-
-	if RSS is False and 'progress' not in options:
-		print 'Error fetching feed.'
-
-	log('done')
+if __name__ == '__main__':
+	main()
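
With main() dispatching on the environment, the same file presumably serves three roles: behind a web server with REQUEST_URI set it runs as a CGI script through wsgiref.handlers.CGIHandler; launched with no arguments (python morss.py) it serves HTTP on the hard-coded port 8080 through wsgiref.simple_server; launched with arguments (e.g. python morss.py http://example.com/feed, URL illustrative) it falls through to cli_app() for a one-shot conversion.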