#!/usr/bin/env python import sys import os import os.path import time from fnmatch import fnmatch from base64 import b64encode, b64decode import re import string import lxml.html import lxml.html.clean import lxml.builder import feeds import feedify import httplib import urllib2 import socket import chardet import urlparse from gzip import GzipFile from StringIO import StringIO from readability import readability LIM_ITEM = 100 # deletes what's beyond MAX_ITEM = 50 # cache-only beyond MAX_TIME = 7 # cache-only after (in sec) DELAY = 10*60 # xml cache & ETag cache (in sec) TIMEOUT = 2 # http timeout (in sec) DEBUG = False UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)' UA_HTML = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11' MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'], 'html': ['text/html', 'application/xhtml+xml']} PROTOCOL = ['http', 'https', 'ftp'] if 'REQUEST_URI' in os.environ: httplib.HTTPConnection.debuglevel = 1 import cgitb cgitb.enable() def log(txt): if DEBUG: print repr(txt) def lenHTML(txt): if len(txt): return len(lxml.html.fromstring(txt).text_content()) else: return 0 def countWord(txt): if len(txt): return len(lxml.html.fromstring(txt).text_content().split()) else: return 0 def parseOptions(): url = '' options = [] if 'REQUEST_URI' in os.environ: url = os.environ['REQUEST_URI'][1:] if 'REDIRECT_URL' not in os.environ: url = url[len(os.environ['SCRIPT_NAME']):] if url.startswith(':'): options = url.split('/')[0].split(':')[1:] url = url.split('/', 1)[1] if urlparse.urlparse(url).scheme not in PROTOCOL: url = 'http://' + url else: if len(sys.argv) <= 1: return (None, []) options = sys.argv[1:-1] url = sys.argv[-1] if urlparse.urlparse(url).scheme not in PROTOCOL: url = 'http://' + url return (url, options) class Cache: """Light, error-prone caching system.""" def __init__(self, folder, key): self._key = key self._hash = str(hash(self._key)) self._dir = folder self._file = self._dir + '/' + self._hash self._cached = {} # what *was* cached self._cache = {} # new things to put in cache if os.path.isfile(self._file): data = open(self._file).readlines() for line in data: if "\t" in line: key, bdata = line.split("\t", 1) self._cached[key] = bdata def __del__(self): self.save() def __contains__(self, key): return key in self._cached def get(self, key): if key in self._cached: self._cache[key] = self._cached[key] return b64decode(self._cached[key]) else: return None def set(self, key, content): self._cache[key] = b64encode(content or '') def save(self): if len(self._cache) == 0: return out = [] for (key, bdata) in self._cache.iteritems(): out.append(str(key) + "\t" + bdata) out.append("_key\t" + self._key) txt = "\n".join(out) if not os.path.exists(self._dir): os.makedirs(self._dir) with open(self._file, 'w') as file: file.write(txt) def isYoungerThan(self, sec): if not os.path.exists(self._file): return False return time.time() - os.path.getmtime(self._file) < sec class HTMLDownloader(urllib2.HTTPCookieProcessor): """ Custom urllib2 handler to download html pages, following redirects, using a browser user-agent and storing cookies. """ def __init__(self, useragent=UA_HTML, cookiejar=None): urllib2.HTTPCookieProcessor.__init__(self, cookiejar) self.useragent = useragent def http_request(self, req): urllib2.HTTPCookieProcessor.http_request(self, req) req.add_unredirected_header('Accept-Encoding', 'gzip') req.add_unredirected_header('User-Agent', self.useragent) return req def http_response(self, req, resp): urllib2.HTTPCookieProcessor.http_response(self, req, resp) if 200 <= resp.code < 300 and resp.info().maintype == 'text': data = resp.read() # gzip if resp.headers.get('Content-Encoding') == 'gzip': log('un-gzip') data = GzipFile(fileobj=StringIO(data), mode='r').read() # redirect if resp.info().type in MIMETYPE['html']: match = re.search(r'(?i)]*?url=(http.*?)["\']', data) if match: newurl = match.groups()[0] log('redirect: %s' % newurl) newheaders = dict((k,v) for k,v in req.headers.items() if k.lower() not in ('content-length', 'content-type')) new = urllib2.Request(newurl, headers=newheaders, origin_req_host=req.get_origin_req_host(), unverifiable=True) return self.parent.open(new, timeout=req.timeout) # decode data = decodeHTML(data, resp) fp = StringIO(data) old_resp = resp resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg return resp https_response = http_response https_request = http_request class CacheDownload(urllib2.BaseHandler): """ Custom urllib2 handler to download a page, using etag/last-modified headers, to save bandwidth. The given headers are added back into the header on error 304 for easier use. """ def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_RSS): self.cache = cache self.etag = etag self.lastmodified = lastmodified self.useragent = useragent def http_request(self, req): req.add_unredirected_header('User-Agent', self.useragent) if self.cache: if self.etag: req.add_unredirected_header('If-None-Match', self.etag) if self.lastmodified: req.add_unredirected_header('If-Modified-Since', self.lastmodified) return req def http_error_304(self, req, fp, code, msg, headers): log('http cached') if self.etag: headers.addheader('etag', self.etag) if self.lastmodified: headers.addheader('last-modified', self.lastmodified) resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200) return resp https_request = http_request def decodeHTML(data, con=None): if con is not None and con.headers.getparam('charset'): log('header') enc = con.headers.getparam('charset') else: match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data) if match: log('meta.re') enc = match.groups()[0] else: log('chardet') enc = chardet.detect(data)['encoding'] log(enc) return data.decode(enc, 'replace') def Fill(item, cache, feedurl='/', fast=False): """ Returns True when it has done its best """ if not item.link: log('no link') return True log(item.link) # check relative urls item.link = urlparse.urljoin(feedurl, item.link) # google if fnmatch(item.link, 'http://www.google.com/url?q=*'): item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0] log(item.link) # facebook if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'): item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0] log(item.link) # feedburner feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0' match = item.xval('feedburner:origLink') if match: item.link = match log(item.link) # feedsportal match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link) if match: url = match.groups()[0].split('0') t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'} item.link = ''.join([(t[s[0]] if s[0] in t else '=') + s[1:] for s in url[1:]]) log(item.link) # reddit if urlparse.urlparse(item.link).netloc == 'www.reddit.com': match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href') if len(match): item.link = match[0] log(item.link) # check unwanted uppercase title if len(item.title) > 20 and item.title.isupper(): item.title = item.title.title() # content already provided? count_content = countWord(item.content) count_desc = countWord(item.desc) if max(count_content, count_desc) > 500: if count_desc > count_content: item.content = item.desc del item.desc log('reversed sizes') log('long enough') return True if count_content > 5*count_desc > 0 and count_content > 50: log('content bigger enough') return True link = item.link # twitter if urlparse.urlparse(item.link).netloc == 'twitter.com': match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url') if len(match): link = match[0] log(link) else: link = None if link is None: log('no used link') return True # check cache and previous errors if link in cache: content = cache.get(link) match = re.search(r'^error-([a-z]{2,10})$', content) if match: if cache.isYoungerThan(DELAY): log('cached error: %s' % match.groups()[0]) return True else: log('old error') else: log('cached') item.pushContent(cache.get(link)) return True # super-fast mode if fast: log('skipped') return False # download try: url = link.encode('utf-8') con = urllib2.build_opener(HTMLDownloader()).open(url, timeout=TIMEOUT) data = con.read() except (urllib2.URLError, httplib.HTTPException, socket.timeout): log('http error') cache.set(link, 'error-http') return True if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain': log('non-text page') cache.set(link, 'error-type') return True out = readability.Document(data, url=con.url).summary(True) if countWord(out) > max(count_content, count_desc) > 0: item.pushContent(out) cache.set(link, out) else: log('not bigger enough') cache.set(link, 'error-length') return True return True def Gather(url, cachePath, options): log(url) url = url.replace(' ', '%20') cache = Cache(cachePath, url) log(cache._hash) # fetch feed if cache.isYoungerThan(DELAY) and 'xml' in cache and 'style' in cache: log('xml cached') xml = cache.get('xml') style = cache.get('style') else: try: opener = CacheDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), useragent=UA_HTML) con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT) xml = con.read() except (urllib2.URLError, httplib.HTTPException, socket.timeout): return False cache.set('xml', xml) cache.set('etag', con.headers.getheader('etag')) cache.set('lastmodified', con.headers.getheader('last-modified')) if xml[:5] == ' LIM_ITEM > 0: item.remove() continue elif time.time() - startTime > MAX_TIME >= 0 or i+1 > MAX_ITEM > 0: if Fill(item, cache, url, True) is False: item.remove() continue else: Fill(item, cache, url) if item.desc and item.content: if 'clip' in options: item.content = item.desc + "

* * *

" + item.content del item.desc if 'keep' not in options: del item.desc log(len(rss.items)) log(time.time() - startTime) return rss.tostring(xml_declaration=True, encoding='UTF-8') if __name__ == '__main__': url, options = parseOptions() DEBUG = 'debug' in options if 'REQUEST_URI' in os.environ: if 'HTTP_IF_NONE_MATCH' in os.environ and 'force' not in options: if time.time() - int(os.environ['HTTP_IF_NONE_MATCH'][1:-1]) < DELAY: print 'Status: 304' print log(url) log('etag good') sys.exit(0) print 'Status: 200' print 'ETag: "%s"' % int(time.time()) if 'html' in options: print 'Content-Type: text/html' elif 'debug' in options: print 'Content-Type: text/plain' elif 'progress' in options: print 'Content-Type: application/octet-stream' else: print 'Content-Type: text/xml' print cache = os.getcwd() + '/cache' else: cache = os.path.expanduser('~') + '/.cache/morss' if url is None: print 'Please provide url.' sys.exit(1) if 'progress' in options: MAX_TIME = -1 if 'cache' in options: MAX_TIME = 0 RSS = Gather(url, cache, options) if RSS is not False and 'progress' not in options and not DEBUG: print RSS if RSS is False and 'progress' not in options: print 'Error fetching feed.' log('done')