Huge commit: wsgiref-like. "progress" dropped.

Can now run as an HTTP server. The main functions were split again (Init, Fetch). LIM/MAX_STUFF moved into the function body (this still has to be improved). "progress" was too hard to get working with wsgiref, so it was dropped. All of this should make the code easier to maintain (no more ifs everywhere).
parent f12031cbdb
commit bd1c6a5be6

morss.py: 204 lines changed
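The headline change is the move to WSGI: one application callable now serves both deployment modes, run through wsgiref.handlers.CGIHandler when invoked as a CGI script and through wsgiref.simple_server when started standalone. A minimal sketch of that pattern (hello_app is a stand-in; morss wires in its own cgi_wrapper, shown in the diff below):

    import os
    import wsgiref.handlers
    import wsgiref.simple_server

    def hello_app(environ, start_response):
        # any WSGI callable works unchanged in both modes
        start_response('200 OK', [('Content-Type', 'text/plain')])
        return ['hello\n']

    if 'REQUEST_URI' in os.environ:
        # launched by a web server as a CGI script
        wsgiref.handlers.CGIHandler().run(hello_app)
    else:
        # standalone HTTP server, as morss now does when run without arguments
        wsgiref.simple_server.make_server('', 8080, hello_app).serve_forever()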
--- a/morss.py
+++ b/morss.py
@@ -26,6 +26,10 @@ import urllib2
 import chardet
 import urlparse
 
+import wsgiref.util
+import wsgiref.simple_server
+import wsgiref.handlers
+
 from gzip import GzipFile
 from StringIO import StringIO
 
@@ -54,7 +58,7 @@ FBAPPTOKEN = FBAPPID + '|' + FBSECRET
 
 PROTOCOL = ['http', 'https', 'ftp']
 
-if 'REQUEST_URI' in os.environ:
+if 'SCRIPT_NAME' in os.environ:
 	httplib.HTTPConnection.debuglevel = 1
 
 	import cgitb
@@ -63,9 +67,9 @@ if 'REQUEST_URI' in os.environ:
 class MorssException(Exception):
 	pass
 
-def log(txt):
-	if DEBUG:
-		if HOLD:
+def log(txt, force=False):
+	if DEBUG or force:
+		if 'REQUEST_URI' in os.environ:
 			open('morss.log', 'a').write("%s\n" % repr(txt))
 		else:
 			print repr(txt)
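log() gains a force flag so a message can bypass the DEBUG switch, and CGI detection now keys on REQUEST_URI rather than the old HOLD global. A stripped-down sketch of the new behaviour (file logging omitted):

    DEBUG = False

    def log(txt, force=False):
        if DEBUG or force:
            print repr(txt)

    log('only shown in debug mode')  # silent here
    log('always shown', force=True)  # printed even with DEBUG off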
@@ -84,16 +88,21 @@ def countWord(txt):
 		return 0
 
 class ParseOptions:
-	def __init__(self):
+	def __init__(self, environ=False):
 		self.url = ''
 		self.options = {}
 		roptions = []
 
-		if 'REQUEST_URI' in os.environ:
-			self.url = os.environ['REQUEST_URI'][1:]
+		if environ:
+			if 'REQUEST_URI' in environ:
+				self.url = environ['REQUEST_URI'][1:]
+			else:
+				self.url = environ['PATH_INFO'][1:]
 
-			if 'REDIRECT_URL' not in os.environ:
-				self.url = self.url[len(os.environ['SCRIPT_NAME']):]
+			if self.url.startswith('/morss.py'):
+				self.url = self.url[10:]
+			elif self.url.startswith('morss.py'):
+				self.url = self.url[9:]
 
 			if self.url.startswith(':'):
 				roptions = self.url.split('/')[0].split(':')[1:]
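ParseOptions now reads the WSGI environ dict passed in by the server instead of os.environ, and strips the script-name prefix by length rather than via SCRIPT_NAME/REDIRECT_URL. Roughly, the URL extraction behaves like this (extract_url is a hypothetical helper for illustration):

    def extract_url(environ):
        # REQUEST_URI (set e.g. by Apache in CGI mode) carries the full
        # original URI; plain WSGI servers only guarantee PATH_INFO
        if 'REQUEST_URI' in environ:
            url = environ['REQUEST_URI'][1:]
        else:
            url = environ['PATH_INFO'][1:]
        # strip the script name plus its trailing slash, if present
        if url.startswith('/morss.py'):
            url = url[10:]  # len('/morss.py/') == 10
        elif url.startswith('morss.py'):
            url = url[9:]   # len('morss.py/') == 9
        return url

    print extract_url({'PATH_INFO': '/morss.py/:json/http://example.com/feed'})
    # -> ':json/http://example.com/feed'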
@@ -475,7 +484,7 @@ def Fill(item, cache, feedurl='/', fast=False):
 
 	return True
 
-def Gather(url, cachePath, options):
+def Init(url, cachePath, options):
 	# url clean up
 	log(url)
 
@@ -492,6 +501,9 @@ def Gather(url, cachePath, options):
 	cache = Cache(cachePath, url, options.proxy)
 	log(cache._hash)
 
+	return (url, cache)
+
+def Fetch(url, cache, options):
 	# do some useful facebook work
 	feedify.PreWorker(url, cache)
 
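With this hunk the old monolithic Gather() becomes a three-stage pipeline: Init cleans the URL and opens the cache, the new Fetch downloads and parses the feed, and what is left of Gather fills in the items. Callers chain the stages like this (a sketch mirroring cli_app further down; assumes morss.py is importable, and the feed URL is a placeholder):

    import os
    from morss import ParseOptions, Init, Fetch, Gather

    options = ParseOptions()  # reads options from sys.argv
    url, cache = Init('http://example.com/feed.xml',
                      os.path.expanduser('~/.cache/morss'), options)
    rss = Fetch(url, cache, options)        # download + parse the feed
    rss = Gather(rss, url, cache, options)  # fetch full text for each item
    print rss.tostring(xml_declaration=True, encoding='UTF-8')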
@@ -545,7 +557,7 @@ def Gather(url, cachePath, options):
 		match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
 		if len(match):
 			link = urlparse.urljoin(url, match[0])
-			return Gather(link, cachePath, options)
+			return Fetch(link, cachePath, options)
 		else:
 			log('no-link html')
 			raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed')
@@ -553,14 +565,35 @@ def Gather(url, cachePath, options):
 		log('random page')
 		raise MorssException('Link provided is not a valid feed')
 
+
+	cache.save()
+	return rss
+
+def Gather(rss, url, cache, options):
+	log('YEAH')
+
 	size = len(rss.items)
 	startTime = time.time()
 
 	# custom settings
+	global LIM_ITEM
+	global LIM_TIME
+	global MAX_ITEM
+	global MAX_TIME
+
 	if options.progress:
 		MAX_TIME = -1
+		LIM_TIME = 15
+		MAX_ITEM = -1
+		LIM_ITEM = -1
 	if options.cache:
 		MAX_TIME = 0
+	if options.OFCOURSENOT:
+		log('welcome home')
+		LIM_ITEM = -1
+		LIM_TIME = -1
+		MAX_ITEM = -1
+		MAX_TIME = -1
 
 	# set
 	def runner(queue):
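Moving the LIM/MAX overrides inside Gather() behind global statements is what the commit message flags as "still has to be improved": in a long-running wsgiref process, module-level limits mutated by one request leak into the next. A toy reproduction of the hazard (names are illustrative):

    MAX_TIME = 10  # module-level default

    def handle(options):
        global MAX_TIME
        if options.get('cache'):
            MAX_TIME = 0  # sticks around after this request returns
        return MAX_TIME

    print handle({'cache': True})  # 0, as requested
    print handle({})               # still 0 -- the default 10 is gone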
@@ -601,14 +634,6 @@ def Gather(url, cachePath, options):
 		if not options.keep:
 			del item.desc
 
-		if options.progress:
-			end = size if MAX_ITEM == -1 else min(MAX_ITEM, size)
-			if options.json:
-				sys.stdout.write(json.dumps((i+1, end, item), default=lambda o: dict(o)) + "\n")
-			else:
-				sys.stdout.write("%s/%s\n" % (i+1, end))
-			sys.stdout.flush()
-
 	queue = Queue.Queue()
 
 	for i in range(THREADS):
|
||||||
|
|
||||||
return rss
|
return rss
|
||||||
|
|
||||||
if __name__ == '__main__':
|
def cgi_app(environ, start_response):
|
||||||
|
options = ParseOptions(environ)
|
||||||
|
url = options.url
|
||||||
|
headers = {}
|
||||||
|
|
||||||
|
global DEBUG
|
||||||
|
DEBUG = options.debug
|
||||||
|
|
||||||
|
if 'HTTP_IF_NONE_MATCH' in environ:
|
||||||
|
if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
|
||||||
|
headers['status'] = '304 Not Modified'
|
||||||
|
start_response(headers['status'], headers.items())
|
||||||
|
log(url)
|
||||||
|
log('etag good')
|
||||||
|
return []
|
||||||
|
|
||||||
|
headers['status'] = '200 OK'
|
||||||
|
headers['etag'] = '"%s"' % int(time.time())
|
||||||
|
|
||||||
|
if options.html:
|
||||||
|
headers['content-type'] = 'text/html'
|
||||||
|
elif options.debug or options.txt:
|
||||||
|
headers['content-type'] = 'text/plain'
|
||||||
|
elif options.json:
|
||||||
|
headers['content-type'] = 'application/json'
|
||||||
|
else:
|
||||||
|
headers['content-type'] = 'text/xml'
|
||||||
|
|
||||||
|
url, cache = Init(url, os.getcwd() + '/cache', options)
|
||||||
|
RSS = Fetch(url, cache, options)
|
||||||
|
RSS = Gather(RSS, url, cache, options)
|
||||||
|
|
||||||
|
if headers['content-type'] == 'text/xml':
|
||||||
|
headers['content-type'] = RSS.mimetype
|
||||||
|
|
||||||
|
start_response(headers['status'], headers.items())
|
||||||
|
|
||||||
|
if not DEBUG and not options.silent:
|
||||||
|
if options.json:
|
||||||
|
if options.indent:
|
||||||
|
return json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
|
||||||
|
else:
|
||||||
|
return json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
|
||||||
|
else:
|
||||||
|
return RSS.tostring(xml_declaration=True, encoding='UTF-8')
|
||||||
|
|
||||||
|
log('done')
|
||||||
|
|
||||||
|
def cgi_wrapper(environ, start_response):
|
||||||
|
try:
|
||||||
|
return cgi_app(environ, start_response)
|
||||||
|
except (KeyboardInterrupt, SystemExit):
|
||||||
|
raise
|
||||||
|
except MorssException as e:
|
||||||
|
headers = {}
|
||||||
|
headers['status'] = '500 Oops'
|
||||||
|
headers['content-type'] = 'text/plain'
|
||||||
|
start_response(headers['status'], headers.items(), sys.exc_info())
|
||||||
|
return 'Internal Error: %s' % e.message
|
||||||
|
except Exception as e:
|
||||||
|
headers = {}
|
||||||
|
headers['status'] = '500 Oops'
|
||||||
|
headers['content-type'] = 'text/plain'
|
||||||
|
start_response(headers['status'], headers.items(), sys.exc_info())
|
||||||
|
return 'Unknown Error: %s' % e.message
|
||||||
|
|
||||||
|
def cli_app():
|
||||||
options = ParseOptions()
|
options = ParseOptions()
|
||||||
url = options.url
|
url = options.url
|
||||||
|
|
||||||
DEBUG = bool(options.debug)
|
global DEBUG
|
||||||
|
DEBUG = options.debug
|
||||||
|
|
||||||
if 'REQUEST_URI' in os.environ:
|
url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
|
||||||
HOLD = True
|
RSS = Fetch(url, cache, options)
|
||||||
|
RSS = Gather(RSS, url, cache, options)
|
||||||
|
|
||||||
if 'HTTP_IF_NONE_MATCH' in os.environ:
|
if not DEBUG and not options.silent:
|
||||||
if not options.force and not options.facebook and time.time() - int(os.environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
|
if options.json:
|
||||||
print 'Status: 304'
|
if options.indent:
|
||||||
print
|
print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
|
||||||
log(url)
|
|
||||||
log('etag good')
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
cachePath = os.getcwd() + '/cache'
|
|
||||||
else:
|
else:
|
||||||
cachePath = os.path.expanduser('~') + '/.cache/morss'
|
print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
|
||||||
|
else:
|
||||||
|
print RSS.tostring(xml_declaration=True, encoding='UTF-8')
|
||||||
|
|
||||||
|
log('done')
|
||||||
|
|
||||||
if options.facebook:
|
if options.facebook:
|
||||||
facebook = Cache(cachePath, 'facebook', persistent=True, dic=True)
|
facebook = Cache(cachePath, 'facebook', persistent=True, dic=True)
|
||||||
|
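cgi_app stamps every response with a timestamp ETag and answers 304 Not Modified when the client echoes it back within DELAY seconds. From the client side the exchange looks roughly like this (a sketch assuming a morss instance listening on localhost:8080):

    import urllib2

    FEED = 'http://localhost:8080/http://example.com/feed.xml'

    resp = urllib2.urlopen(FEED)
    etag = resp.info().getheader('ETag')  # e.g. '"1387000000"', a timestamp

    req = urllib2.Request(FEED, headers={'If-None-Match': etag})
    try:
        urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        # within DELAY seconds of the timestamp, morss replies 304
        print e.code  # 304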
@@ -685,37 +777,23 @@ if __name__ == '__main__':
 
 		sys.exit(0)
 
+def main():
 	if 'REQUEST_URI' in os.environ:
-		print 'Status: 200'
-		print 'ETag: "%s"' % int(time.time())
+		wsgiref.handlers.CGIHandler().run(cgi_wrapper)
+
+	elif len(sys.argv) <= 1:
+		httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper)
+		httpd.serve_forever()
 
-		if options.html:
-			print 'Content-Type: text/html'
-		elif options.debug or options.txt:
-			print 'Content-Type: text/plain'
-		elif options.progress:
-			print 'Content-Type: application/octet-stream'
-		elif options.json:
-			print 'Content-Type: application/json'
-		else:
-			print 'Content-Type: text/xml'
-		print ''
-
-	HOLD = False
-
-	RSS = Gather(url, cachePath, options)
-
-	if RSS is not False and not options.progress and not DEBUG and not options.silent:
-		if options.json:
-			if options.indent:
-				print json.dumps(RSS, sort_keys=True, indent=4, default=lambda x: dict(x))
-			else:
-				print json.dumps(RSS, sort_keys=True, default=lambda x: dict(x))
-		else:
-			print RSS.tostring(xml_declaration=True, encoding='UTF-8')
-
-	if RSS is False and 'progress' not in options:
-		print 'Error fetching feed.'
-
-	log('done')
+	else:
+		try:
+			cli_app()
+		except (KeyboardInterrupt, SystemExit):
+			raise
+		except MorssException as e:
+			print 'Internal Error: %s' % e.message
+		except Exception as e:
+			print 'Unknown Error: %s' % e.message
+
+if __name__ == '__main__':
+	main()