2013-04-04 15:56:37 +00:00
|
|
|
#!/usr/bin/env python
|
2013-02-25 14:50:32 +00:00
|
|
|
import sys
|
2013-02-25 17:01:59 +00:00
|
|
|
import os
|
2013-04-15 16:51:55 +00:00
|
|
|
import os.path
|
2013-04-22 22:00:07 +00:00
|
|
|
import time
|
2013-04-22 22:04:44 +00:00
|
|
|
|
2013-11-24 19:52:53 +00:00
|
|
|
import Queue
|
|
|
|
import threading
|
|
|
|
|
2013-10-01 17:49:53 +00:00
|
|
|
from fnmatch import fnmatch
|
2013-04-22 22:04:44 +00:00
|
|
|
from base64 import b64encode, b64decode
|
|
|
|
import re
|
|
|
|
import string
|
2013-11-30 18:59:19 +00:00
|
|
|
import json
|
2013-04-22 22:04:44 +00:00
|
|
|
|
2013-04-15 16:51:55 +00:00
|
|
|
import lxml.html
|
|
|
|
import lxml.html.clean
|
|
|
|
import lxml.builder
|
2013-04-22 22:04:44 +00:00
|
|
|
|
2013-07-14 16:44:11 +00:00
|
|
|
import feeds
|
2013-09-25 10:36:21 +00:00
|
|
|
import feedify
|
2013-07-14 16:44:11 +00:00
|
|
|
|
2013-09-25 09:10:16 +00:00
|
|
|
import httplib
|
2013-12-01 14:42:35 +00:00
|
|
|
import urllib
|
2013-02-25 14:50:32 +00:00
|
|
|
import urllib2
|
2013-05-05 13:31:11 +00:00
|
|
|
import socket
|
2013-04-04 15:43:30 +00:00
|
|
|
import chardet
|
2013-05-15 15:12:59 +00:00
|
|
|
import urlparse
|
2013-04-04 15:43:30 +00:00
|
|
|
|
2013-07-16 21:33:45 +00:00
|
|
|
from gzip import GzipFile
|
|
|
|
from StringIO import StringIO
|
|
|
|
|
2013-04-19 09:37:43 +00:00
|
|
|
from readability import readability
|
2013-04-15 16:51:55 +00:00
|
|
|
|
2013-05-15 15:56:58 +00:00
|
|
|
# Per-feed processing limits. They are compared with ">= 0" where they are
# used, so a negative value switches the corresponding limit off.
LIM_ITEM = 100  # items beyond this position are removed from the feed
LIM_TIME = 7  # items reached after this many seconds are removed (in sec)
MAX_ITEM = 50  # items beyond this position are only filled from the cache
MAX_TIME = 7  # items reached after this many seconds are cache-only (in sec)

DELAY = 10 * 60  # lifetime of the xml cache & of the ETag (in sec)
TIMEOUT = 2  # http timeout (in sec)
THREADS = 10  # number of worker threads (1 for single-threaded)

DEBUG = False  # log() stays silent unless this is set
HOLD = False  # while set, log() appends to morss.log instead of printing

# User agents sent when downloading (UA_HTML is SimpleDownload's default).
UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
UA_HTML = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'

# Short names for groups of mime-types.
MIMETYPE = {
    'xml': [
        'text/xml',
        'application/xml',
        'application/rss+xml',
        'application/rdf+xml',
        'application/atom+xml',
    ],
    'html': [
        'text/html',
        'application/xhtml+xml',
    ],
}

# Facebook application credentials.
FBAPPID = "<insert yours>"
FBSECRET = "<insert yours>"
FBAPPTOKEN = '%s|%s' % (FBAPPID, FBSECRET)

# URL schemes accepted as-is; anything else gets "http://" prepended.
PROTOCOL = ['http', 'https', 'ftp']

# Running as a CGI script: turn on http debugging and html tracebacks.
if 'REQUEST_URI' in os.environ:
    httplib.HTTPConnection.debuglevel = 1

    import cgitb
    cgitb.enable()
|
2013-02-25 17:01:59 +00:00
|
|
|
|
2013-02-25 20:36:02 +00:00
|
|
|
def log(txt):
    """
    Output the repr() of `txt` for debugging purposes.

    Nothing happens unless the module-level DEBUG flag is set. While HOLD is
    set, the line is appended to the file 'morss.log' in the current
    directory instead of being printed on stdout.
    """
    if not DEBUG:
        return

    line = repr(txt)

    if HOLD:
        # the context manager makes sure the handle is flushed and closed
        # right away instead of whenever the garbage collector gets to it
        with open('morss.log', 'a') as logfile:
            logfile.write("%s\n" % line)
    else:
        print(line)
|
2013-04-15 16:51:55 +00:00
|
|
|
|
|
|
|
|
2013-05-15 15:24:27 +00:00
|
|
|
def lenHTML(txt):
    """ Number of characters of the text content of an html snippet (0 when empty) """
    if len(txt) == 0:
        return 0

    text = lxml.html.fromstring(txt).text_content()
    return len(text)
|
2013-05-15 15:24:27 +00:00
|
|
|
|
2013-07-14 16:57:12 +00:00
|
|
|
def countWord(txt):
    """ Number of whitespace-separated words in the text content of an html snippet (0 when empty) """
    if len(txt) == 0:
        return 0

    words = lxml.html.fromstring(txt).text_content().split()
    return len(words)
|
|
|
|
|
2013-11-03 12:18:09 +00:00
|
|
|
class ParseOptions:
    """
    Reads the feed url and the options from the environment.

    When REQUEST_URI is in the environment (cgi), the request looks like
    "/:option1:option2=value/url"; otherwise the options and the url come
    from the command line ("option1 option2=value url", the url being last).

    The url ends up in `self.url` ('' when nothing was given on the command
    line) and the options in the `self.options` dict. Options can also be
    read as attributes: an option that was not given reads as False.
    """

    def __init__(self):
        self.url = ''
        self.options = {}
        roptions = []

        if 'REQUEST_URI' in os.environ:
            # drop the leading '/'
            self.url = os.environ['REQUEST_URI'][1:]

            if 'REDIRECT_URL' not in os.environ:
                self.url = self.url[len(os.environ['SCRIPT_NAME']):]

            if self.url.startswith(':'):
                # partition() copes with an option prefix that is not
                # followed by '/' (the url is then empty) instead of
                # raising IndexError
                head, _, rest = self.url.partition('/')
                roptions = head.split(':')[1:]
                self.url = rest
        else:
            if len(sys.argv) <= 1:
                return

            roptions = sys.argv[1:-1]
            self.url = sys.argv[-1]

        if urlparse.urlparse(self.url).scheme not in PROTOCOL:
            self.url = 'http://' + self.url

        for option in roptions:
            key, value = self._parse_option(option)
            self.options[key] = value

    @staticmethod
    def _parse_option(option):
        """ Turns "name" or "name=value" into a (name, value) pair """
        name, sep, value = option.partition('=')

        if not sep:
            # bare option: simple flag
            return name, True

        # the *value* decides whether this is a boolean; anything else is
        # kept as the raw string
        lowered = value.lower()
        if lowered == 'true':
            return name, True
        if lowered == 'false':
            return name, False

        return name, value

    def __contains__(self, key):
        """ True when the option was explicitly given """
        return key in self.options

    def __getattr__(self, key):
        """ Value of the option `key`, False when it was not given """
        # special method lookups (copy, pickle, bool(), 'in'...) must fail
        # the regular way rather than receive False
        if key.startswith('__') and key.endswith('__'):
            raise AttributeError(key)

        # going through __dict__ avoids endless recursion when `options`
        # itself has not been set yet
        return self.__dict__.get('options', {}).get(key, False)
|
2013-05-01 15:57:09 +00:00
|
|
|
|
2013-04-15 16:51:55 +00:00
|
|
|
class Cache:
    """
    Light, error-prone caching system.

    One cache is one json file, stored in `folder` and named after the
    url-quoted `key`. `_cached` holds what was found in the file, `_cache`
    what will be written by save(): values only make it to the next
    generation of the file if they were read (get) or written (set) in the
    meantime, unless the cache is `persistent`, in which case everything
    that was loaded is kept.
    """

    def __init__(self, folder, key, persistent=False):
        self._key = key
        self._dir = folder

        # keep the file name within the limit reported by the file system
        maxsize = os.statvfs('./').f_namemax - len(self._dir) - 1
        self._hash = urllib.quote_plus(self._key)[:maxsize]

        self._file = self._dir + '/' + self._hash

        self._cached = {}  # what *was* cached
        self._cache = {}  # new things to put in cache

        self._cached = self._load()

        if persistent:
            # same dict on purpose: whatever was loaded gets saved again
            self._cache = self._cached

    def _load(self):
        """ Content of the cache file as a dict, {} if missing or unusable """
        if not os.path.isfile(self._file):
            return {}

        try:
            with open(self._file) as cachefile:
                data = json.loads(cachefile.read())
        except (IOError, ValueError):
            # unreadable or truncated/corrupted file (e.g. read while
            # another process is rewriting it): start from scratch rather
            # than crash, it is only a cache
            return {}

        return data if isinstance(data, dict) else {}

    def __del__(self):
        self.save()

    def __contains__(self, key):
        return key in self._cache or key in self._cached

    def get(self, key):
        """ Cached value for `key`, or None. Reading a value keeps it alive. """
        if key in self._cache:
            return self._cache[key]

        if key in self._cached:
            # carry it over so that it is written again by save()
            self._cache[key] = self._cached[key]
            return self._cached[key]

        return None

    def set(self, key, content):
        """ Stores `content` under `key` (written to disk by save()) """
        self._cache[key] = content

    def save(self):
        """ Writes the cache file, creating the folder if needed. No-op when there is nothing to write. """
        if len(self._cache) == 0:
            return

        if not os.path.exists(self._dir):
            os.makedirs(self._dir)

        with open(self._file, 'w+') as cachefile:
            cachefile.write(json.dumps(self._cache, indent=4))

    def isYoungerThan(self, sec):
        """ True if the cache file exists and was modified less than `sec` seconds ago """
        if not os.path.exists(self._file):
            return False

        return time.time() - os.path.getmtime(self._file) < sec

    def new(self, key, persistent=False):
        """ Returns a Cache object in the same directory """
        if key != self._key:
            return Cache(self._dir, key, persistent)
        else:
            return self

    def redirect(self, key, persistent=False):
        """
        Makes this very object point to another key, in the same directory.

        The object is re-initialised: entries which were not saved yet are
        dropped. Returns None.
        """
        return self.__init__(self._dir, key, persistent)
|
|
|
|
|
2013-10-21 19:17:52 +00:00
|
|
|
class SimpleDownload(urllib2.HTTPCookieProcessor):
    """
    Custom urllib2 handler to download a page, using etag/last-modified headers,
    to save bandwidth. The given headers are added back into the header on error
    304 for easier use.

    cache: body to hand back when the server answers 304; the conditional
        headers are only sent when it is non-empty
    etag, lastmodified: values sent as If-None-Match / If-Modified-Since
    useragent: value of the User-Agent header
    decode: decode text bodies with decodeHTML()
    cookiejar: passed on to urllib2.HTTPCookieProcessor
    accept: a mime-type, a key of MIMETYPE, or a sequence of those (or of
        sequences of mime-types), ordered by preference; None sends no
        Accept header
    strict: when False, "*/*" is appended to the Accept header with the
        lowest preference
    """

    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=False, cookiejar=None, accept=None, strict=False):
        urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
        self.cache = cache
        self.etag = etag
        self.lastmodified = lastmodified
        self.useragent = useragent
        self.decode = decode
        self.accept = accept
        self.strict = strict

    def _accept_header(self):
        """ Builds the value of the Accept header out of self.accept """
        accept = self.accept
        if isinstance(accept, basestring):
            accept = (accept,)

        out = {}
        # defined up front so that an empty `accept` does not leave `rank`
        # unbound for the "*/*" entry below
        rank = 1
        for (i, group) in enumerate(accept):
            # each group is a bit less preferred than the previous one
            rank = 1 - i*0.1

            if isinstance(group, basestring):
                if group in MIMETYPE:
                    group = MIMETYPE[group]
                else:
                    out[group] = rank
                    continue

            for mime in group:
                # first (i.e. best) rank wins
                if mime not in out:
                    out[mime] = rank

        if not self.strict:
            out['*/*'] = rank-0.1

        return ','.join([x+';q={:.1}'.format(out[x]) if out[x] != 1 else x for x in out])

    def http_request(self, req):
        urllib2.HTTPCookieProcessor.http_request(self, req)
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        req.add_unredirected_header('User-Agent', self.useragent)
        if req.get_host() != 'feeds.feedburner.com':
            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())

        # conditional request, only worth it if there is a body to fall
        # back to on "304 Not Modified"
        if self.cache:
            if self.etag:
                req.add_unredirected_header('If-None-Match', self.etag)
            if self.lastmodified:
                req.add_unredirected_header('If-Modified-Since', self.lastmodified)

        if self.accept is not None:
            header = self._accept_header()
            log(header)

            req.add_unredirected_header('Accept', header)

        return req

    def http_error_304(self, req, fp, code, msg, headers):
        """ Turns a 304 into a regular 200 response made of the cached body """
        log('http cached')
        if self.etag:
            headers.addheader('etag', self.etag)
        if self.lastmodified:
            headers.addheader('last-modified', self.lastmodified)
        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
        return resp

    def http_response(self, req, resp):
        """ Un-gzips the body, follows <meta> redirects and decodes text """
        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
        data = resp.read()

        success = 200 <= resp.code < 300

        if success:
            # gzip
            if resp.headers.get('Content-Encoding') == 'gzip':
                log('un-gzip')
                data = GzipFile(fileobj=StringIO(data), mode='r').read()

        if success and resp.info().maintype == 'text':
            # <meta> redirect
            if resp.info().type in MIMETYPE['html']:
                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                if match:
                    newurl = match.groups()[0]
                    log('redirect: %s' % newurl)

                    # same request, minus the headers describing a body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                        if k.lower() not in ('content-length', 'content-type'))
                    new = urllib2.Request(newurl,
                        headers=newheaders,
                        origin_req_host=req.get_origin_req_host(),
                        unverifiable=True)

                    return self.parent.open(new, timeout=req.timeout)

            # decode
            if self.decode:
                data = decodeHTML(data, resp)

        # the body was consumed by read(): wrap the (possibly modified) data
        # in a fresh response carrying the original metadata
        fp = StringIO(data)
        old_resp = resp
        resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response
    https_request = http_request
|
|
|
|
|
2013-09-25 09:08:58 +00:00
|
|
|
def decodeHTML(data, con=None):
    """
    Decodes a downloaded page to unicode.

    The encoding is taken, in that order, from the charset of the headers
    of `con` (if given), from the first "charset=" found in `data`, or
    guessed by chardet. Undecodable bytes are replaced. `data` is returned
    untouched when no usable encoding could be found.
    """
    enc = con.headers.getparam('charset') if con is not None else None

    if enc:
        log('header')
    else:
        # '_' is part of common charset names (e.g. Shift_JIS), do not cut
        # them short
        match = re.search('charset=["\']?([0-9a-zA-Z_-]+)', data)
        if match:
            log('meta.re')
            enc = match.groups()[0]
        else:
            log('chardet')
            enc = chardet.detect(data)['encoding']

    log(enc)

    if not enc:
        return data

    try:
        return data.decode(enc, 'replace')
    except LookupError:
        # the page announces an encoding python knows nothing about
        log('unknown encoding')

    guess = chardet.detect(data)['encoding']
    log(guess)

    try:
        return data.decode(guess, 'replace') if guess else data
    except LookupError:
        return data
|
2013-04-15 16:51:55 +00:00
|
|
|
|
2013-10-22 18:55:24 +00:00
|
|
|
# feedsportal escape codes: in the encoded url, "0" followed by one of these
# letters stands for the matching piece of url
_FEEDSPORTAL_CODES = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'H':',', 'I':'_', 'L':'http://', 'S':'www.', 'N':'.com', 'O':'.co.uk'}


def _decodeFeedsportal(encoded):
    """ Rebuilds the real url out of the long path segment of a feedsportal link """
    # what comes before the first "0" is dropped; unknown codes become "=".
    # Empty chunks (two "0" in a row) carry no code at all and are skipped
    # instead of raising IndexError.
    chunks = encoded.split('0')[1:]
    return ''.join([_FEEDSPORTAL_CODES.get(chunk[0], '=') + chunk[1:] for chunk in chunks if chunk])


def Fix(item, feedurl='/'):
    """
    Improves feed items (absolute links, resolve feedburner links, etc)

    The item is modified in place and returned. Items without link only get
    their title fixed.
    """

    # check unwanted uppercase title
    if len(item.title) > 20 and item.title.isupper():
        item.title = item.title.title()

    # check if it includes link
    if not item.link:
        log('no link')
        return item

    # check relative urls
    item.link = urlparse.urljoin(feedurl, item.link)

    # google
    if fnmatch(item.link, 'http://www.google.com/url?q=*'):
        item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['q'][0]
        log(item.link)

    # facebook
    if fnmatch(item.link, 'https://www.facebook.com/l.php?u=*'):
        item.link = urlparse.parse_qs(urlparse.urlparse(item.link).query)['u'][0]
        log(item.link)

    # feedburner
    feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
    match = item.xval('feedburner:origLink')
    if match:
        item.link = match

    # feedsportal
    match = re.search('/([0-9a-zA-Z]{20,})/story01.htm$', item.link)
    if match:
        item.link = _decodeFeedsportal(match.groups()[0])
        log(item.link)

    # reddit: the real link is the "[link]" anchor of the description
    if urlparse.urlparse(feedurl).netloc == 'www.reddit.com':
        match = lxml.html.fromstring(item.desc).xpath('//a[text()="[link]"]/@href')
        if len(match):
            item.link = match[0]
            log(item.link)

    return item
|
|
|
|
|
|
|
|
def Fill(item, cache, feedurl='/', fast=False):
    """
    Returns True when it has done its best

    Tries to replace the content of `item` with the full article found at
    its link. Results (and failures, as "error-xxx" strings) are stored in
    `cache` under the link.

    With `fast`, nothing is downloaded: only the cache is used, and False is
    returned if the article would have had to be downloaded.
    """

    if not item.link:
        log('no link')
        # nothing more can be done: stick to the True/False contract
        return True

    log(item.link)

    # content already provided?
    count_content = countWord(item.content)
    count_desc = countWord(item.desc)

    if max(count_content, count_desc) > 500:
        if count_desc > count_content:
            # the full text sits in the description
            item.content = item.desc
            del item.desc
            log('reversed sizes')
        log('long enough')
        return True

    if count_content > 5*count_desc > 0 and count_content > 50:
        log('content bigger enough')
        return True

    link = item.link

    # twitter
    if urlparse.urlparse(feedurl).netloc == 'twitter.com':
        match = lxml.html.fromstring(item.content).xpath('//a/@data-expanded-url')
        if len(match):
            link = match[0]
            log(link)
        else:
            link = None

    # facebook
    if urlparse.urlparse(feedurl).netloc == 'graph.facebook.com':
        match = lxml.html.fromstring(item.content).xpath('//a/@href')
        if len(match) and urlparse.urlparse(match[0]).netloc != 'www.facebook.com':
            link = match[0]
            log(link)
        else:
            link = None

    if link is None:
        log('no used link')
        return True

    # check cache and previous errors
    if link in cache:
        content = cache.get(link)
        match = re.search(r'^error-([a-z]{2,10})$', content)
        if match:
            if cache.isYoungerThan(DELAY):
                log('cached error: %s' % match.groups()[0])
                return True
            else:
                # give the page another chance
                log('old error')
        else:
            log('cached')
            item.pushContent(content)
            return True

    # super-fast mode
    if fast:
        log('skipped')
        return False

    # download
    try:
        url = link.encode('utf-8')
        con = urllib2.build_opener(SimpleDownload(decode=True, accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
        data = con.read()
    except (IOError, httplib.HTTPException):
        log('http error')
        cache.set(link, 'error-http')
        return True

    if con.info().type not in MIMETYPE['html'] and con.info().type != 'text/plain':
        log('non-text page')
        cache.set(link, 'error-type')
        return True

    out = readability.Document(data, url=con.url).summary(True)

    # only worth it if there is more text than what the feed provided
    if countWord(out) > max(count_content, count_desc) > 0:
        item.pushContent(out)
        cache.set(link, out)
    else:
        log('not bigger enough')
        cache.set(link, 'error-length')

    return True
|
2013-04-15 16:51:55 +00:00
|
|
|
|
2013-09-29 13:32:58 +00:00
|
|
|
def Gather(url, cachePath, options):
    """
    Downloads the feed at `url` and fills its items with full articles.

    url: address of the feed, or of an html page linking to its feed
    cachePath: folder in which the cache files are stored
    options: object whose attributes (proxy, theforce, progress, clip,
        keep) are read as flags

    Returns the resulting feed as an xml string, or False when the feed
    could not be downloaded or no feed could be made out of the page.
    """
    log(url)

    url = url.replace(' ', '%20')
    cache = Cache(cachePath, url, options.proxy)

    log(cache._hash)

    # do some useful facebook work
    feedify.PreWorker(url, cache)

    if 'redirect' in cache:
        url = cache.get('redirect')
        log('url redirect')
        log(url)

    if 'cache' in cache:
        cache.redirect(cache.get('cache'))
        log('cache redirect')

    # fetch feed
    if cache.isYoungerThan(DELAY) and not options.theforce and 'xml' in cache and 'style' in cache:
        log('xml cached')
        xml = cache.get('xml')
        style = cache.get('style')
    else:
        try:
            # NOTE(review): the body is stored under 'xml' below, so
            # cache.get(url) looks like it is always None, which disables
            # the conditional request -- confirm before changing
            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), decode=False, accept=('xml','html'))
            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT)
            xml = con.read()
        except (IOError, httplib.HTTPException):
            return False

        cache.set('xml', xml)
        cache.set('etag', con.headers.getheader('etag'))
        cache.set('lastmodified', con.headers.getheader('last-modified'))

        if xml.startswith('<?xml') or con.info().type in MIMETYPE['xml']:
            style = 'normal'
        elif feedify.supported(url):
            style = 'feedify'
        elif con.info().type in MIMETYPE['html']:
            style = 'html'
        else:
            style = 'none'
            log(con.info().type)

        cache.set('style', style)

    log(style)

    if style == 'normal':
        rss = feeds.parse(xml)
    elif style == 'feedify':
        feed = feedify.Builder(url, xml, cache)
        feed.build()
        rss = feed.feed
    elif style == 'html':
        # html page: look for the feed it advertises and start over with it
        match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
        if len(match):
            link = urlparse.urljoin(url, match[0])
            return Gather(link, cachePath, options)
        else:
            log('no-link html')
            return False
    else:
        log('random page')
        return False

    size = len(rss.items)
    startTime = time.time()

    # set
    def runner(queue):
        """ Thread body: processes items from the queue, forever """
        while True:
            try:
                worker(*queue.get())
            except Exception:
                # a failing item must neither kill the thread nor skip
                # task_done(), or queue.join() below would wait forever
                import traceback
                log(traceback.format_exc())
                traceback.print_exc()
            finally:
                queue.task_done()

    def worker(i, item):
        """ Fixes and fills the item at position `i` of the feed """
        if options.progress:
            if MAX_ITEM == -1:
                print('%s/%s' % (i+1, size))
            else:
                print('%s/%s' % (i+1, min(MAX_ITEM, size)))
            sys.stdout.flush()

        # hard limits: the item is dropped
        if time.time() - startTime > LIM_TIME >= 0 or i+1 > LIM_ITEM >= 0:
            log('dropped')
            item.remove()
            return

        item = Fix(item, url)

        # soft limits: only use what is in cache, drop the item otherwise
        if time.time() - startTime > MAX_TIME >= 0 or i+1 > MAX_ITEM >= 0:
            if not options.proxy:
                if Fill(item, cache, url, True) is False:
                    item.remove()
                    return
        else:
            if not options.proxy:
                Fill(item, cache, url)

        if item.desc and item.content:
            if options.clip:
                item.content = item.desc + "<br/><br/><center>* * *</center><br/><br/>" + item.content
                del item.desc
            if not options.keep:
                del item.desc

    queue = Queue.Queue()

    for i in range(THREADS):
        t = threading.Thread(target=runner, args=(queue,))
        # daemon threads: they never return, and must not keep the process alive
        t.daemon = True
        t.start()

    for i, item in enumerate(rss.items):
        queue.put([i, item])

    queue.join()
    cache.save()

    log(len(rss.items))
    log(time.time() - startTime)

    return rss.tostring(xml_declaration=True, encoding='UTF-8')
|
2013-04-04 15:43:30 +00:00
|
|
|
|
2013-07-14 17:00:16 +00:00
|
|
|
# Entry point, used both from the command line and as a cgi script (in which
# case REQUEST_URI is in the environment and http headers have to be printed).
if __name__ == '__main__':
    options = ParseOptions()
    url = options.url

    DEBUG = bool(options.debug)

    if 'REQUEST_URI' in os.environ:
        # keep the log out of stdout until the http headers are sent
        HOLD = True

        if 'HTTP_IF_NONE_MATCH' in os.environ:
            # our ETags are quoted timestamps, but the client may send
            # anything: ignore what is not a number
            try:
                etag_time = int(os.environ['HTTP_IF_NONE_MATCH'][1:-1])
            except ValueError:
                etag_time = None

            if etag_time is not None and not options.force and not options.facebook and time.time() - etag_time < DELAY:
                print('Status: 304')
                print('')
                log(url)
                log('etag good')
                sys.exit(0)

        cachePath = os.getcwd() + '/cache'
    else:
        cachePath = os.path.expanduser('~') + '/.cache/morss'

    if options.facebook:
        facebook = Cache(cachePath, 'facebook', True)

        # get real token from code
        code = urlparse.parse_qs(urlparse.urlparse(url).query)['code'][0]
        eurl = "https://graph.facebook.com/oauth/access_token?client_id={app_id}&redirect_uri={redirect_uri}&client_secret={app_secret}&code={code_parameter}".format(app_id=FBAPPID, app_secret=FBSECRET, code_parameter=code, redirect_uri="http://test.morss.it/:facebook/")
        token = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())['access_token'][0]

        # get long-lived access token
        eurl = "https://graph.facebook.com/oauth/access_token?grant_type=fb_exchange_token&client_id={app_id}&client_secret={app_secret}&fb_exchange_token={short_lived_token}".format(app_id=FBAPPID, app_secret=FBSECRET, short_lived_token=token)
        values = urlparse.parse_qs(urllib2.urlopen(eurl).read().strip())

        ltoken = values['access_token'][0]
        expires = int(time.time() + int(values['expires'][0]))

        # get user id
        # ({token} is a named field: it has to be filled by keyword)
        iurl = "https://graph.facebook.com/me?fields=id&access_token={token}".format(token=ltoken)
        user_id = json.loads(urllib2.urlopen(iurl).read())['id']

        # do sth out of it
        facebook.set('t'+ltoken, user_id)
        facebook.set('e'+ltoken, expires)
        facebook.set('u'+user_id, ltoken)

        # only set when the cache has no entry for that user yet
        if 'o'+user_id not in facebook:
            facebook.set('o'+user_id, ltoken)

        facebook.save()

        if 'REQUEST_URI' in os.environ:
            print('Status: 200')
            print('Content-Type: text/plain')
            print('')

        print("token updated")

        sys.exit(0)

    if 'REQUEST_URI' in os.environ:
        print('Status: 200')
        # the ETag is the time of generation, see the 304 check above
        print('ETag: "%s"' % int(time.time()))

        if options.html:
            print('Content-Type: text/html')
        elif options.debug or options.txt:
            print('Content-Type: text/plain')
        elif options.progress:
            print('Content-Type: application/octet-stream')
        else:
            print('Content-Type: text/xml')
        print('')

        # headers are out: the log may go to stdout from now on
        HOLD = False

    # options.url is '' (never None) when no url was given
    if not url:
        print('Please provide url.')
        sys.exit(1)

    if options.progress:
        MAX_TIME = -1
    if options.cache:
        MAX_TIME = 0

    RSS = Gather(url, cachePath, options)

    if RSS is not False and not options.progress and not DEBUG and not options.silent:
        print(RSS)

    if RSS is False and not options.progress:
        print('Error fetching feed.')

    log('done')
|