#!/usr/bin/env python
import sys
import os
import os.path
import time
import Queue
import threading
from fnmatch import fnmatch
import re
import json
import lxml.html
import feeds
import feedify
import httplib
import urllib
import urllib2
import chardet
import urlparse
import wsgiref.simple_server
import wsgiref.handlers
from gzip import GzipFile
from StringIO import StringIO
from readability import readability
from html2text import HTML2Text
LIM_ITEM = 100 # drop items beyond this count
LIM_TIME = 7 # drop remaining items once this many seconds have elapsed
MAX_ITEM = 50 # beyond this count, fill items from cache only
MAX_TIME = 7 # after this many seconds, fill items from cache only
DELAY = 10*60 # xml cache & ETag cache (in sec)
TIMEOUT = 2 # http timeout (in sec)
THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
UA_HTML = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'
MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
FBAPPID = "<insert yours>"
FBSECRET = "<insert yours>"
FBAPPTOKEN = FBAPPID + '|' + FBSECRET
PROTOCOL = ['http', 'https', 'ftp']
if 'SCRIPT_NAME' in os.environ:
    httplib.HTTPConnection.debuglevel = 1

    import cgitb
    cgitb.enable()

class MorssException(Exception):
    pass

def log(txt, force=False):
    if DEBUG or force:
        if 'REQUEST_URI' in os.environ:
            open('morss.log', 'a').write("%s\n" % repr(txt))
        else:
            print repr(txt)

def lenHTML(txt):
    if len(txt):
        return len(lxml.html.fromstring(txt).text_content())
    else:
        return 0

def countWord(txt):
    if len(txt):
        return len(lxml.html.fromstring(txt).text_content().split())
    else:
        return 0

class Options:
    def __init__(self, options=None):
        self.options = options or []

    def __getattr__(self, key):
        return key in self.options

    def __setitem__(self, key, value):
        self.options[key] = value

    def __contains__(self, key):
        return key in self.options
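
# Usage sketch (illustrative): option flags arrive as plain strings, either from the
# command line (cli_app: sys.argv[1:-1]) or from a ':'-prefixed URL segment parsed in
# cgi_app(); attribute access is a membership test, so e.g. Options(['md', 'clip']).md
# is True and Options(['md', 'clip']).json is False.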

class Cache:
    """ Light, error-prone caching system. """
    def __init__(self, folder=None, key='cache', lifespan=10*24*3600):
        self._key = key
        self._dir = folder
        self._lifespan = lifespan

        self._cache = {}

        if self._dir is None:
            self._hash = "NO CACHE"
            return

        maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1 - 4 # ".tmp"
        self._hash = urllib.quote_plus(self._key)[:maxsize]

        self._file = self._dir + '/' + self._hash
        self._file_tmp = self._file + '.tmp'

        if os.path.isfile(self._file):
            data = open(self._file).read()
            if data:
                self._cache = json.loads(data)

    def __del__(self):
        self.save()

    def __contains__(self, key):
        return key in self._cache

    def get(self, key):
        if key in self._cache:
            self._cache[key]['last'] = time.time()
            return self._cache[key]['value']
        else:
            return None

    def set(self, key, content):
        self._cache[key] = {'last': time.time(), 'value': content}

    __getitem__ = get
    __setitem__ = set

    def save(self):
        if len(self._cache) == 0 or self._dir is None:
            return

        if not os.path.exists(self._dir):
            os.makedirs(self._dir)

        # drop expired entries; iterate over a copy of the keys since we delete from the dict
        for i in self._cache.keys():
            if time.time() - self._cache[i]['last'] > self._lifespan > -1:
                del self._cache[i]

        out = json.dumps(self._cache, indent=4)

        try:
            open(self._file_tmp, 'w+').write(out)
            os.rename(self._file_tmp, self._file)
        except IOError:
            log('failed to write cache to tmp file')
        except OSError:
            log('failed to move cache to file')

    def last(self, key):
        if key not in self._cache:
            return -1
        return self._cache[key]['last']

    def age(self, key):
        if key not in self._cache:
            return -1
        return time.time() - self.last(key)

    def new(self, *arg, **karg):
        """ Returns a Cache object in the same directory """
        if arg[0] != self._key:
            return Cache(self._dir, *arg, **karg)
        else:
            return self
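
    def redirect(self, key):
        # Assumed helper, sketched here because Fetch() calls cache.redirect() but
        # the method is missing from this listing: re-key this cache object onto
        # another entry so later reads and writes share it. The real implementation
        # may differ.
        return self.__init__(self._dir, key, self._lifespan)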

class SimpleDownload(urllib2.HTTPCookieProcessor):
    """
    Custom urllib2 handler to download a page, using etag/last-modified headers
    to save bandwidth. On a 304 response the given headers are added back to the
    reply so the cached content can be reused transparently.
    """
    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None, accept=None, strict=False):
        urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
        self.cache = cache
        self.etag = etag
        self.lastmodified = lastmodified
        self.useragent = useragent
        self.decode = decode
        self.accept = accept
        self.strict = strict

    def http_request(self, req):
        urllib2.HTTPCookieProcessor.http_request(self, req)
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        req.add_unredirected_header('User-Agent', self.useragent)

        if req.get_host() != 'feeds.feedburner.com':
            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())

        if self.cache:
            if self.etag:
                req.add_unredirected_header('If-None-Match', self.etag)
            if self.lastmodified:
                req.add_unredirected_header('If-Modified-Since', self.lastmodified)

        if self.accept is not None:
            # req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            if isinstance(self.accept, basestring):
                self.accept = (self.accept,)

            out = {}
            rank = 1.1
            for group in self.accept:
                rank = rank - 0.1

                if isinstance(group, basestring):
                    if group in MIMETYPE:
                        group = MIMETYPE[group]
                    else:
                        out[group] = rank
                        continue

                for mime in group:
                    if mime not in out:
                        out[mime] = rank

            if not self.strict:
                out['*/*'] = rank-0.1

            string = ','.join([x+';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
            req.add_unredirected_header('Accept', string)

        return req

    def http_error_304(self, req, fp, code, msg, headers):
        log('http cached')
        if self.etag:
            headers.addheader('etag', self.etag)
        if self.lastmodified:
            headers.addheader('last-modified', self.lastmodified)

        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
        return resp

    def http_response(self, req, resp):
        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
        odata = data = resp.read()

        if 200 <= resp.code < 300:
            # gzip
            if resp.headers.get('Content-Encoding') == 'gzip':
                log('un-gzip')
                data = GzipFile(fileobj=StringIO(data), mode='r').read()

        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            # <meta> redirect
            if resp.info().type in MIMETYPE['html']:
                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                if match:
                    newurl = match.groups()[0]
                    log('redirect: %s' % newurl)

                    newheaders = dict((k,v) for k,v in req.headers.items()
                        if k.lower() not in ('content-length', 'content-type'))
                    new = urllib2.Request(newurl,
                        headers=newheaders,
                        origin_req_host=req.get_origin_req_host(),
                        unverifiable=True)
                    return self.parent.open(new, timeout=req.timeout)

            # encoding
            enc = detEncoding(data, resp)
            if enc:
                data = data.decode(enc, 'replace')

                if not self.decode:
                    data = data.encode(enc)

        fp = StringIO(data)
        old_resp = resp
        resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg
        return resp

    https_response = http_response
    https_request = http_request

def detEncoding(data, con=None):
    if con is not None and con.headers.getparam('charset'):
        log('header')
        return con.headers.getparam('charset')

    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        log('meta.re')
        return match.groups()[0]

    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
    if match:
        return match.groups()[0].lower()

    return None
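
# Illustrative behaviour (sketch based on the patterns above): a response whose
# Content-Type header carries "charset=iso-8859-1" wins outright; otherwise a body
# starting with '<?xml version="1.0" encoding="UTF-8"?>' falls through to the
# encoding= pattern and yields 'utf-8'.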

def Fix(item, feedurl='/'):
    """ Improves feed items (absolute links, resolve feedburner links, etc) """

    # check unwanted uppercase title
    if len(item.title) > 20 and item.title.isupper():
        item.title = item.title.title()

    # check if it includes link
    if not item.link:
        log('no link')
        return item

    # wikipedia daily highlight
    if fnmatch(feedurl, 'http*://*.wikipedia.org/w/api.php?*&feedformat=atom'):
        match = lxml.html.fromstring(item.desc).xpath('//b/a/@href')
        if len(match):
            item.link = match[0]
            log(item.link)

    # check relative urls
    item.link = urlparse.urljoin(feedurl, item.link)

    # google translate
    if fnmatch(item.link, 'http://translate.google.*/translate*u=*'):
        item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
        log(item.link)

    # google
    if fnmatch(item.link, 'http://www.google.*/url?q=*'):
        item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0]
        log(item.link)

    # google news
    if fnmatch(item.link, 'http://news.google.com/news/url*url=*'):
        item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['url'][0]
        log(item.link)

    # facebook
    if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
        item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
        log(item.link)

    # feedburner
    feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
    match = item.xval('feedburner:origLink')
    if match:
        item.link = match

    # feedsportal: '0'-separated slug where e.g. 0L -> 'http://', 0S -> 'www.',
    # 0B -> '.', 0C -> '/', so '0L0Sexample0Bcom0C' decodes to 'http://www.example.com/'
    match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
    if match:
        url = match.groups()[0].split('0')
        t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}
        item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]])
        log(item.link)

    # reddit
    if urlparse.urlparse(feedurl).netloc == 'www.reddit.com':
        match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
        if len(match):
            item.link = match[0]
            log(item.link)

    return item

def Fill(item, cache, feedurl='/', fast=False):
    """ Returns True when it has done its best """

    if not item.link:
        log('no link')
        return item

    log(item.link)

    # content already provided?
    count_content = countWord(item.content)
    count_desc = countWord(item.desc)

    if max(count_content, count_desc) > 500:
        if count_desc > count_content:
            item.content = item.desc
            del item.desc
            log('reversed sizes')
        log('long enough')
        return True

    if count_content > 5*count_desc > 0 and count_content > 50:
        log('content bigger enough')
        return True

    link = item.link

    # twitter
    if urlparse.urlparse(feedurl).netloc == 'twitter.com':
        match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
        if len(match):
            link = match[0]
            log(link)
        else:
            link = None

    # facebook
    if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com':
        match = lxml.html.fromstring(item.content).xpath('//a/@href')
        if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com':
            link = match[0]
            log(link)
        else:
            link = None

    if link is None:
        log('no used link')
        return True

    # check cache and previous errors
    if link in cache:
        content = cache.get(link)
        match = re.search(r'^error-([a-z]{2,10})$', content)
        if match:
            # trust a recent cached error; once it is older than DELAY, retry the download
            if cache.age(link) < DELAY:
                log('cached error: %s' % match.groups()[0])
                return True
            else:
                log('old error')
        else:
            log('cached')
            item.pushContent(cache.get(link))
            return True

    # super-fast mode
    if fast:
        log('skipped')
        return False

    # download
    try:
        url = link.encode('utf-8')
        con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
        data = con.read()
    except (IOError, httplib.HTTPException) as e:
        log('http error: %s' % e.message)
        cache.set(link, 'error-http')
        return True

    if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain':
        log('non-text page')
        cache.set(link, 'error-type')
        return True

    out = readability.Document(data, url=con.url).summary(True)

    if countWord(out) > max(count_content, count_desc) > 0:
        item.pushContent(out)
        cache.set(link, out)
    else:
        log('not bigger enough')
        cache.set(link, 'error-length')
        return True

    return True

def Init(url, cachePath, options):
    # url clean up
    log(url)

    if url is None:
        raise MorssException('No url provided')

    if urlparse.urlparse(url).scheme not in PROTOCOL:
        url = 'http://' + url
        log(url)

    url = url.replace(' ', '%20')

    # cache
    cache = Cache(cachePath, url)
    log(cache._hash)

    return (url, cache)

def Fetch(url, cache, options):
    # do some useful facebook work
    feedify.PreWorker(url, cache)

    if 'redirect' in cache:
        url = cache.get('redirect')
        log('url redirect')
        log(url)

    if 'cache' in cache:
        cache.redirect(cache.get('cache'))
        log('cache redirect')

    # fetch feed
    if not options.theforce and 'xml' in cache and cache.age('xml') < DELAY and 'style' in cache:
        log('xml cached')
        xml = cache.get('xml')
        style = cache.get('style')
    else:
        try:
            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), accept=('xml','html'))
            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT)
            xml = con.read()
        except (IOError, httplib.HTTPException):
            raise MorssException('Error downloading feed')

        cache.set('xml', xml)
        cache.set('etag', con.headers.getheader('etag'))
        cache.set('lastmodified', con.headers.getheader('last-modified'))

        if url.startswith('https://itunes.apple.com/lookup?id='):
            style = 'itunes'
        elif xml.startswith('<?xml') or con.info().type in MIMETYPE['xml']:
            style = 'normal'
        elif feedify.supported(url):
            style = 'feedify'
        elif con.info().type in MIMETYPE['html']:
            style = 'html'
        else:
            style = 'none'
            log(con.info().type)

        cache.set('style', style)

    log(style)

    if style == 'itunes':
        link = json.loads(xml)['results'][0]['feedUrl']
        log('itunes redirect: %s' % link)
        return Fetch(link, cache.new(link), options)
    elif style == 'normal':
        rss = feeds.parse(xml)
    elif style == 'feedify':
        feed = feedify.Builder(url, xml, cache)
        feed.build()
        rss = feed.feed
    elif style == 'html':
        match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
        if len(match):
            link = urlparse.urljoin(url, match[0])
            log('rss redirect: %s' % link)
            return Fetch(link, cache.new(link), options)
        else:
            log('no-link html')
            raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed')
    else:
        log('random page')
        raise MorssException('Link provided is not a valid feed')

    cache.save()
    return rss

def Gather(rss, url, cache, options):
    size = len(rss.items)
    startTime = time.time()

    # custom settings
    lim_item = LIM_ITEM
    lim_time = LIM_TIME
    max_item = MAX_ITEM
    max_time = MAX_TIME

    if options.cache:
        max_time = 0

    # set
    def runner(queue):
        while True:
            value = queue.get()
            try:
                worker(*value)
            except Exception as e:
                log('Thread Error: %s' % e.message)
            queue.task_done()

    def worker(i, item):
        if time.time() - startTime > lim_time >= 0 or i+1 > lim_item >= 0:
            log('dropped')
            item.remove()
            return

        item = Fix(item, url)

        if time.time() - startTime > max_time >= 0 or i+1 > max_item >= 0:
            if not options.proxy:
                if Fill(item, cache, url, True) is False:
                    item.remove()
                    return
        else:
            if not options.proxy:
                Fill(item, cache, url)

    queue = Queue.Queue()

    for i in range(THREADS):
        t = threading.Thread(target=runner, args=(queue,))
        t.daemon = True
        t.start()

    for i, item in enumerate(rss.items):
        queue.put([i, item])

    queue.join()
    cache.save()

    log(len(rss.items))
    log(time.time() - startTime)

    return rss

def After(rss, options):
    for i, item in enumerate(rss.items):
        if item.desc and item.content:
            if options.clip:
                item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
                del item.desc
            if not options.keep:
                del item.desc

        if options.md:
            conv = HTML2Text(baseurl=item.link)
            conv.unicode_snob = True

            if item.desc:
                item.desc = conv.handle(item.desc)
            if item.content:
                item.content = conv.handle(item.content)

    if options.json:
        if options.indent:
            return rss.tojson(indent=4)
        else:
            return rss.tojson()
    elif options.csv:
        return rss.tocsv()
    elif options.reader:
        return rss.tohtml()
    else:
        return rss.tostring(xml_declaration=True, encoding='UTF-8')

def process(url, cache=None, options=None):
    if options is None:
        options = []

    options = Options(options)
    url, cache = Init(url, cache, options)
    rss = Fetch(url, cache, options)
    rss = Gather(rss, url, cache, options)

    return After(rss, options)
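
# Library-style usage sketch (URL, cache path and option names are examples only):
#     out = process('http://example.com/feed.xml', '/tmp/morss-cache', ['md'])
# Init() normalises the URL and opens a cache under the given path, Fetch() and
# Gather() build and fill the feed, and After() returns it serialised, here with
# item bodies converted to Markdown because of the 'md' option.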

def cgi_app(environ, start_response):
    # get options
    if 'REQUEST_URI' in environ:
        url = environ['REQUEST_URI'][1:]
    else:
        url = environ['PATH_INFO'][1:]
    url = re.sub(r'^/?morss.py/', '', url)

    if url.startswith(':'):
        options = url.split('/')[0].split(':')[1:]
        url = url.split('/', 1)[1]
    else:
        options = []

    # init
    options = Options(options)
    headers = {}

    global DEBUG
    DEBUG = options.debug

    if 'HTTP_IF_NONE_MATCH' in environ:
        if not options.force and not options.facebook and time.time() - int(environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY:
            headers['status'] = '304 Not Modified'
            start_response(headers['status'], headers.items())
            log(url)
            log('etag good')
            return []

    # headers
    headers['status'] = '200 OK'
    headers['etag'] = '"%s"' % int(time.time())

    if options.html or options.reader:
        headers['content-type'] = 'text/html'
    elif options.debug or options.txt:
        headers['content-type'] = 'text/plain'
    elif options.json:
        headers['content-type'] = 'application/json'
    elif options.csv:
        headers['content-type'] = 'text/csv'
        headers['content-disposition'] = 'attachment; filename="feed.csv"'
    else:
        headers['content-type'] = 'text/xml'

    url, cache = Init(url, os.getcwd() + '/cache', options)

    if options.facebook:
        doFacebook(url, environ, headers, options, cache)
        start_response(headers['status'], headers.items())
        return

    # get the work done
    RSS = Fetch(url, cache, options)

    if headers['content-type'] == 'text/xml':
        headers['content-type'] = RSS.mimetype

    start_response(headers['status'], headers.items())

    RSS = Gather(RSS, url, cache, options)

    if not DEBUG and not options.silent:
        return After(RSS, options)

    log('done')

def cgi_wrapper(environ, start_response):
    # simple http server for html and css
    files = {
        '': 'text/html',
        'index.html': 'text/html'}

    if 'REQUEST_URI' in environ:
        url = environ['REQUEST_URI'][1:]
    else:
        url = environ['PATH_INFO'][1:]

    if url in files:
        headers = {}

        if url == '':
            url = 'index.html'

        if os.path.isfile(url):
            headers['status'] = '200 OK'
            headers['content-type'] = files[url]
            start_response(headers['status'], headers.items())
            return open(url, 'rb').read()
        else:
            headers['status'] = '404 Not found'
            start_response(headers['status'], headers.items())
            return ''

    # actual morss use
    try:
        return cgi_app(environ, start_response) or []
    except (KeyboardInterrupt, SystemExit):
        raise
    except MorssException as e:
        headers = {}
        headers['status'] = '500 Oops'
        headers['content-type'] = 'text/plain'
        start_response(headers['status'], headers.items(), sys.exc_info())
        return 'Internal Error: %s' % e.message
    except Exception as e:
        headers = {}
        headers['status'] = '500 Oops'
        headers['content-type'] = 'text/plain'
        start_response(headers['status'], headers.items(), sys.exc_info())
        return 'Unknown Error: %s' % e.message

def cli_app():
    options = Options(sys.argv[1:-1])
    url = sys.argv[-1]

    global DEBUG
    DEBUG = options.debug

    url, cache = Init(url, os.path.expanduser('~/.cache/morss'), options)
    RSS = Fetch(url, cache, options)
    RSS = Gather(RSS, url, cache, options)

    if not DEBUG and not options.silent:
        print After(RSS, options)

    log('done')

def doFacebook(url, environ, headers, options, cache):
    log('fb stuff')

    query = urlparse.urlparse(url).query

    if 'code' in query:
        # get real token from code
        code = urlparse.parse_qs(query)['code'][0]
        eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri="http://morss.it/:facebook/")
        token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]

        # get long-lived access token
        eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
        values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())

        ltoken = values['access_token'][0]
        expires = int(time.time() + int(values['expires'][0]))

        headers['set-cookie'] = 'token={token}; Path=/'.format(token=ltoken)

    # headers
    headers['status'] = '303 See Other'
    headers['location'] = 'http://{domain}/'.format(domain=environ['SERVER_NAME'])

    log('fb done')
    return

def main():
    if 'REQUEST_URI' in os.environ:
        wsgiref.handlers.CGIHandler().run(cgi_wrapper)
    elif len(sys.argv) <= 1:
        httpd = wsgiref.simple_server.make_server('', 8080, cgi_wrapper)
        httpd.serve_forever()
    else:
        try:
            cli_app()
        except (KeyboardInterrupt, SystemExit):
            raise
        except MorssException as e:
            print 'Internal Error: %s' % e.message
        except Exception as e:
            print 'Unknown Error: %s' % e.message

if __name__ == '__main__':
    main()