import sys import os import os.path import time from datetime import datetime from dateutil import tz import threading from fnmatch import fnmatch import re import lxml.etree import lxml.html from . import feeds from . import crawler from . import readabilite import wsgiref.simple_server import wsgiref.handlers import cgitb try: # python 2 from Queue import Queue from httplib import HTTPException from urllib import unquote from urlparse import urlparse, urljoin, parse_qs except ImportError: # python 3 from queue import Queue from http.client import HTTPException from urllib.parse import unquote from urllib.parse import urlparse, urljoin, parse_qs LIM_ITEM = 100 # deletes what's beyond LIM_TIME = 7 # deletes what's after MAX_ITEM = 50 # cache-only beyond MAX_TIME = 7 # cache-only after (in sec) DELAY = 10 * 60 # xml cache & ETag cache (in sec) TIMEOUT = 4 # http timeout (in sec) THREADS = 10 # number of threads (1 for single-threaded) DEBUG = False PORT = 8080 PROTOCOL = ['http', 'https'] def filterOptions(options): return options # example of filtering code below #allowed = ['proxy', 'clip', 'cache', 'force', 'silent', 'pro', 'debug'] #filtered = dict([(key,value) for (key,value) in options.items() if key in allowed]) #return filtered class MorssException(Exception): pass def log(txt, force=False): if DEBUG or force: if 'REQUEST_URI' in os.environ: open('morss.log', 'a').write("%s\n" % repr(txt)) else: print(repr(txt)) def len_html(txt): if len(txt): return len(lxml.html.fromstring(txt).text_content()) else: return 0 def count_words(txt): if len(txt): return len(lxml.html.fromstring(txt).text_content().split()) return 0 class Options: def __init__(self, options=None, **args): if len(args): self.options = args self.options.update(options or {}) else: self.options = options or {} def __getattr__(self, key): if key in self.options: return self.options[key] else: return False def __setitem__(self, key, value): self.options[key] = value def __contains__(self, key): return key in self.options def parseOptions(options): """ Turns ['md=True'] into {'md':True} """ out = {} for option in options: split = option.split('=', 1) if len(split) > 1: if split[0].lower() == 'true': out[split[0]] = True elif split[0].lower() == 'false': out[split[0]] = False else: out[split[0]] = split[1] else: out[split[0]] = True return out def ItemFix(item, feedurl='/'): """ Improves feed items (absolute links, resolve feedburner links, etc) """ # check unwanted uppercase title if len(item.title) > 20 and item.title.isupper(): item.title = item.title.title() # check if it includes link if not item.link: log('no link') return item # wikipedia daily highlight if fnmatch(feedurl, 'http*://*.wikipedia.org/w/api.php?*&feedformat=atom'): match = lxml.html.fromstring(item.desc).xpath('//b/a/@href') if len(match): item.link = match[0] log(item.link) # check relative urls item.link = urljoin(feedurl, item.link) # google translate if fnmatch(item.link, 'http://translate.google.*/translate*u=*'): item.link = parse_qs(urlparse(item.link).query)['u'][0] log(item.link) # google if fnmatch(item.link, 'http://www.google.*/url?q=*'): item.link = parse_qs(urlparse(item.link).query)['q'][0] log(item.link) # google news if fnmatch(item.link, 'http://news.google.com/news/url*url=*'): item.link = parse_qs(urlparse(item.link).query)['url'][0] log(item.link) # pocket if fnmatch(item.link, 'https://getpocket.com/redirect?url=*'): item.link = parse_qs(urlparse(item.link).query)['url'][0] log(item.link) # facebook if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'): item.link = parse_qs(urlparse(item.link).query)['u'][0] log(item.link) # feedburner FIXME only works if RSS... item.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0' match = item.rule_str('feedburner:origLink') if match: item.link = match # feedsportal match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link) if match: url = match.groups()[0].split('0') t = {'A': '0', 'B': '.', 'C': '/', 'D': '?', 'E': '-', 'F': '=', 'G': '&', 'H': ',', 'I': '_', 'J': '%', 'K': '+', 'L': 'http://', 'M': 'https://', 'N': '.com', 'O': '.co.uk', 'P': ';', 'Q': '|', 'R': ':', 'S': 'www.', 'T': '#', 'U': '$', 'V': '~', 'W': '!', 'X': '(', 'Y': ')', 'Z': 'Z'} item.link = ''.join([(t[s[0]] if s[0] in t else s[0]) + s[1:] for s in url[1:]]) log(item.link) # reddit if urlparse(feedurl).netloc == 'www.reddit.com': match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href') if len(match): item.link = match[0] log(item.link) return item def ItemFill(item, options, feedurl='/', fast=False): """ Returns True when it has done its best """ if not item.link: log('no link') return item log(item.link) link = item.link # twitter if urlparse(feedurl).netloc == 'twitter.com': match = lxml.html.fromstring(item.desc).xpath('//a/@data-expanded-url') if len(match): link = match[0] log(link) else: link = None # facebook if urlparse(feedurl).netloc == 'graph.facebook.com': match = lxml.html.fromstring(item.content).xpath('//a/@href') if len(match) and urlparse(match[0]).netloc != 'www.facebook.com': link = match[0] log(link) else: link = None if link is None: log('no used link') return True # download delay = -1 if fast: # super-fast mode delay = -2 try: data, con, contenttype, encoding = crawler.adv_get(url=link, delay=delay, timeout=TIMEOUT) except (IOError, HTTPException) as e: log('http error') return False # let's just delete errors stuff when in cache mode if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain': log('non-text page') return True out = readabilite.get_article(data, url=con.geturl(), encoding=encoding) if out is not None: item.content = out return True def ItemBefore(item, options): # return None if item deleted if options.search: if options.search not in item.title: item.remove() return None return item def ItemAfter(item, options): if options.clip and item.desc and item.content: item.content = item.desc + "

* * *


" + item.content del item.desc if options.nolink and item.content: content = lxml.html.fromstring(item.content) for link in content.xpath('//a'): log(link.text_content()) link.drop_tag() item.content = lxml.etree.tostring(content) if options.noref: item.link = '' return item def UrlFix(url): if url is None: raise MorssException('No url provided') if isinstance(url, bytes): url = url.decode() if urlparse(url).scheme not in PROTOCOL: url = 'http://' + url log(url) url = url.replace(' ', '%20') return url def FeedFetch(url, options): # fetch feed delay = DELAY if options.theforce: delay = 0 try: xml, con, contenttype, encoding = crawler.adv_get(url=url, follow='rss', delay=delay, timeout=TIMEOUT * 2) except (IOError, HTTPException): raise MorssException('Error downloading feed') if options.items: # using custom rules rss = feeds.FeedHTML(xml, encoding=encoding) rss.rules['title'] = options.title if options.title else '//head/title' rss.rules['desc'] = options.desc if options.desc else '//head/meta[@name="description"]/@content' rss.rules['items'] = options.items rss.rules['item_title'] = options.item_title if options.item_title else './/a|.' rss.rules['item_link'] = options.item_link if options.item_link else './@href|.//a/@href' if options.item_content: rss.rules['item_content'] = options.item_content if options.item_time: rss.rules['item_time'] = options.item_time rss = rss.convert(feeds.FeedXML) else: try: rss = feeds.parse(xml, url, contenttype, encoding=encoding) rss = rss.convert(feeds.FeedXML) # contains all fields, otherwise much-needed data can be lost except TypeError: log('random page') log(contenttype) raise MorssException('Link provided is not a valid feed') return rss def FeedGather(rss, url, options): size = len(rss.items) start_time = time.time() # custom settings lim_item = LIM_ITEM lim_time = LIM_TIME max_item = MAX_ITEM max_time = MAX_TIME threads = THREADS if options.cache: max_time = 0 if options.mono: threads = 1 # set def runner(queue): while True: value = queue.get() try: worker(*value) except Exception as e: log('Thread Error: %s' % e.message) queue.task_done() def worker(i, item): if time.time() - start_time > lim_time >= 0 or i + 1 > lim_item >= 0: log('dropped') item.remove() return item = ItemBefore(item, options) if item is None: return item = ItemFix(item, url) if time.time() - start_time > max_time >= 0 or i + 1 > max_item >= 0: if not options.proxy: if ItemFill(item, options, url, True) is False: item.remove() return else: if not options.proxy: ItemFill(item, options, url) item = ItemAfter(item, options) queue = Queue() for i in range(threads): t = threading.Thread(target=runner, args=(queue,)) t.daemon = True t.start() now = datetime.now(tz.tzutc()) sorted_items = sorted(rss.items, key=lambda x:x.updated or x.time or now, reverse=True) for i, item in enumerate(sorted_items): if threads == 1: worker(*[i, item]) else: queue.put([i, item]) if threads != 1: queue.join() if options.ad: new = rss.items.append() new.title = "Are you hungry?" new.desc = "Eat some Galler chocolate :)" new.link = "http://www.galler.com/" new.time = "5 Oct 2013 22:42" log(len(rss.items)) log(time.time() - start_time) return rss def FeedFormat(rss, options, encoding='utf-8'): if options.callback: if re.match(r'^[a-zA-Z0-9\.]+$', options.callback) is not None: out = '%s(%s)' % (options.callback, rss.tojson(encoding='unicode')) return out if encoding == 'unicode' else out.encode(encoding) else: raise MorssException('Invalid callback var name') elif options.json: if options.indent: return rss.tojson(encoding=encoding, indent=4) else: return rss.tojson(encoding=encoding) elif options.csv: return rss.tocsv(encoding=encoding) elif options.html: if options.indent: return rss.tohtml(encoding=encoding, pretty_print=True) else: return rss.tohtml(encoding=encoding) else: if options.indent: return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding, pretty_print=True) else: return rss.torss(xml_declaration=(not encoding == 'unicode'), encoding=encoding) def process(url, cache=None, options=None): if not options: options = [] options = Options(options) if cache: crawler.default_cache = crawler.SQLiteCache(cache) url = UrlFix(url) rss = FeedFetch(url, options) rss = FeedGather(rss, url, options) return FeedFormat(rss, options) def cgi_parse_environ(environ): # get options if 'REQUEST_URI' in environ: url = environ['REQUEST_URI'][1:] else: url = environ['PATH_INFO'][1:] if environ['QUERY_STRING']: url += '?' + environ['QUERY_STRING'] url = re.sub(r'^/?(cgi/)?(morss.py|main.py)/', '', url) if url.startswith(':'): split = url.split('/', 1) raw_options = unquote(split[0]).replace('|', '/').replace('\\\'', '\'').split(':')[1:] if len(split) > 1: url = split[1] else: url = '' else: raw_options = [] # init options = Options(filterOptions(parseOptions(raw_options))) global DEBUG DEBUG = options.debug return (url, options) def cgi_app(environ, start_response): url, options = cgi_parse_environ(environ) headers = {} # headers headers['status'] = '200 OK' headers['cache-control'] = 'max-age=%s' % DELAY if options.cors: headers['access-control-allow-origin'] = '*' if options.html: headers['content-type'] = 'text/html' elif options.txt or options.silent: headers['content-type'] = 'text/plain' elif options.json: headers['content-type'] = 'application/json' elif options.callback: headers['content-type'] = 'application/javascript' elif options.csv: headers['content-type'] = 'text/csv' headers['content-disposition'] = 'attachment; filename="feed.csv"' else: headers['content-type'] = 'text/xml' headers['content-type'] += '; charset=utf-8' crawler.default_cache = crawler.SQLiteCache(os.path.join(os.getcwd(), 'morss-cache.db')) # get the work done url = UrlFix(url) rss = FeedFetch(url, options) if headers['content-type'] == 'text/xml': headers['content-type'] = rss.mimetype[0] start_response(headers['status'], list(headers.items())) rss = FeedGather(rss, url, options) out = FeedFormat(rss, options) if options.silent: return [''] else: return [out] def middleware(func): " Decorator to turn a function into a wsgi middleware " # This is called when parsing the code def app_builder(app): # This is called when doing app = cgi_wrapper(app) def app_wrap(environ, start_response): # This is called when a http request is being processed return func(environ, start_response, app) return app_wrap return app_builder @middleware def cgi_file_handler(environ, start_response, app): " Simple HTTP server to serve static files (.html, .css, etc.) " files = { '': 'text/html', 'index.html': 'text/html', 'sheet.xsl': 'text/xsl'} if 'REQUEST_URI' in environ: url = environ['REQUEST_URI'][1:] else: url = environ['PATH_INFO'][1:] if url in files: headers = {} if url == '': url = 'index.html' if '--root' in sys.argv[1:]: path = os.path.join(sys.argv[-1], url) else: path = url try: body = open(path, 'rb').read() headers['status'] = '200 OK' headers['content-type'] = files[url] start_response(headers['status'], list(headers.items())) return [body] except IOError: headers['status'] = '404 Not found' start_response(headers['status'], list(headers.items())) return ['Error %s' % headers['status']] else: return app(environ, start_response) def cgi_get(environ, start_response): url, options = cgi_parse_environ(environ) # get page PROTOCOL = ['http', 'https'] if urlparse(url).scheme not in ['http', 'https']: url = 'http://' + url data, con, contenttype, encoding = crawler.adv_get(url=url) if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']: if options.get == 'page': html = readabilite.parse(data, encoding=encoding) html.make_links_absolute(con.geturl()) kill_tags = ['script', 'iframe', 'noscript'] for tag in kill_tags: for elem in html.xpath('//'+tag): elem.getparent().remove(elem) output = lxml.etree.tostring(html.getroottree(), encoding='utf-8') elif options.get == 'article': output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug) else: raise MorssException('no :get option passed') else: output = data # return html page headers = {'status': '200 OK', 'content-type': 'text/html; charset=utf-8'} start_response(headers['status'], list(headers.items())) return [output] dispatch_table = { 'get': cgi_get, } @middleware def cgi_dispatcher(environ, start_response, app): url, options = cgi_parse_environ(environ) for key in dispatch_table.keys(): if key in options: return dispatch_table[key](environ, start_response) return app(environ, start_response) @middleware def cgi_error_handler(environ, start_response, app): try: return app(environ, start_response) except (KeyboardInterrupt, SystemExit): raise except Exception as e: headers = {'status': '500 Oops', 'content-type': 'text/html'} start_response(headers['status'], list(headers.items()), sys.exc_info()) log('ERROR: %s' % repr(e), force=True) return [cgitb.html(sys.exc_info())] @middleware def cgi_encode(environ, start_response, app): out = app(environ, start_response) return [x if isinstance(x, bytes) else str(x).encode('utf-8') for x in out] cgi_standalone_app = cgi_encode(cgi_error_handler(cgi_dispatcher(cgi_file_handler(cgi_app)))) def cli_app(): options = Options(filterOptions(parseOptions(sys.argv[1:-1]))) url = sys.argv[-1] global DEBUG DEBUG = options.debug crawler.default_cache = crawler.SQLiteCache(os.path.expanduser('~/.cache/morss-cache.db')) url = UrlFix(url) rss = FeedFetch(url, options) rss = FeedGather(rss, url, options) out = FeedFormat(rss, options, 'unicode') if not options.silent: print(out) log('done') def isInt(string): try: int(string) return True except ValueError: return False def main(): if 'REQUEST_URI' in os.environ: # mod_cgi app = cgi_app app = cgi_dispatcher(app) app = cgi_error_handler(app) app = cgi_encode(app) wsgiref.handlers.CGIHandler().run(app) elif len(sys.argv) <= 1 or isInt(sys.argv[1]) or '--root' in sys.argv[1:]: # start internal (basic) http server if len(sys.argv) > 1 and isInt(sys.argv[1]): argPort = int(sys.argv[1]) if argPort > 0: port = argPort else: raise MorssException('Port must be positive integer') else: port = PORT app = cgi_app app = cgi_file_handler(app) app = cgi_dispatcher(app) app = cgi_error_handler(app) app = cgi_encode(app) print('Serving http://localhost:%s/' % port) httpd = wsgiref.simple_server.make_server('', port, app) httpd.serve_forever() else: # as a CLI app try: cli_app() except (KeyboardInterrupt, SystemExit): raise except Exception as e: print('ERROR: %s' % e.message) if __name__ == '__main__': main()