parent
1b26c5f0e3
commit
8131ea2244
|
@ -14,6 +14,88 @@ MIMETYPE = {
|
||||||
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
|
'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
|
||||||
|
|
||||||
|
|
||||||
|
# SSL-related code proudly copy-pasted from https://stackoverflow.com/questions/1087227/validate-ssl-certificates-with-python
|
||||||
|
|
||||||
|
class InvalidCertificateException(httplib.HTTPException, urllib2.URLError):
    """Raised when an HTTPS peer presents a certificate that is rejected.

    Derives from both ``httplib.HTTPException`` and ``urllib2.URLError`` so
    that ``except`` clauses written for either family also catch it.

    Attributes:
        host: host name the connection was made to.
        cert: the rejected certificate as supplied by the caller (callers
            may pass an empty string when it is not available).
        reason: short description of why the certificate was rejected.
    """

    def __init__(self, host, cert, reason):
        # Forward the details to the base class so ``args`` is populated.
        # With empty ``args`` the exception cannot be copied or pickled
        # (re-creating it would call __init__ without its three required
        # arguments) and repr() shows no detail.
        httplib.HTTPException.__init__(self, host, cert, reason)
        self.host = host
        self.cert = cert
        self.reason = reason

    def __str__(self):
        """Return the human-readable message (text kept unchanged)."""
        return ('Host %s returned an invalid certificate (%s) %s\n' %
                (self.host, self.reason, self.cert))
|
||||||
|
|
||||||
|
|
||||||
|
class CertValidatingHTTPSConnection(httplib.HTTPConnection):
    """HTTPS connection that checks the peer certificate and its host name.

    When ``ca_certs`` is given, the TLS handshake requires a certificate that
    chains to that bundle, and ``connect`` additionally checks that the
    certificate names the host being contacted. Without ``ca_certs`` no
    verification is performed at all.
    """

    default_port = httplib.HTTPS_PORT

    def __init__(self, host, port=None, key_file=None, cert_file=None,
                 ca_certs=None, strict=None, **kwargs):
        """Store the TLS settings and initialise the plain HTTP connection.

        Args:
            host: server host name, forwarded to ``HTTPConnection``.
            port: server port, forwarded to ``HTTPConnection``.
            key_file: optional client private key file for ``ssl``.
            cert_file: optional client certificate file for ``ssl``.
            ca_certs: path to the CA bundle; enables verification when set.
            strict: forwarded positionally to ``HTTPConnection``.
            **kwargs: any further ``HTTPConnection`` arguments
                (e.g. ``timeout``).
        """
        httplib.HTTPConnection.__init__(self, host, port, strict, **kwargs)
        self.key_file = key_file
        self.cert_file = cert_file
        self.ca_certs = ca_certs
        # Verification is only possible when there is a CA bundle to
        # verify against; otherwise fall back to an unverified connection.
        if self.ca_certs:
            self.cert_reqs = ssl.CERT_REQUIRED
        else:
            self.cert_reqs = ssl.CERT_NONE

    def _GetValidHostsForCert(self, cert):
        """Return the host names a certificate dict is valid for.

        DNS entries of ``subjectAltName`` are used when that field is
        present; otherwise the ``commonName`` attributes of the subject.
        """
        if 'subjectAltName' in cert:
            return [value for kind, value in cert['subjectAltName']
                    if kind.lower() == 'dns']
        # Walk every attribute of every RDN (not only the first one) and
        # tolerate a certificate dict without a subject.
        names = []
        for rdn in cert.get('subject', ()):
            for key, value in rdn:
                if key.lower() == 'commonname':
                    names.append(value)
        return names

    def _ValidateCertificateHostname(self, cert, hostname):
        """Return True if ``hostname`` matches one of the certificate names.

        Matching is case-insensitive; ``*`` in a certificate name matches
        any run of characters within a single label (it never crosses a dot).
        """
        for host in self._GetValidHostsForCert(cert):
            # Escape the whole name so regex metacharacters in the
            # certificate are literal, then re-enable only the wildcard.
            pattern = re.escape(host).replace(r'\*', '[^.]*')
            # \Z (unlike $) does not accept a trailing newline.
            if re.match(pattern + r'\Z', hostname, re.I):
                return True
        return False

    def connect(self):
        """Open the TCP connection, wrap it in TLS and verify the host name.

        Raises:
            InvalidCertificateException: verification is enabled and the
                certificate does not name ``self.host``.
        """
        # Honour the timeout given to the constructor instead of blocking
        # indefinitely with the socket default.
        sock = socket.create_connection((self.host, self.port), self.timeout)
        try:
            self.sock = ssl.wrap_socket(sock, keyfile=self.key_file,
                                        certfile=self.cert_file,
                                        cert_reqs=self.cert_reqs,
                                        ca_certs=self.ca_certs)
        except Exception:
            # Do not leak the raw socket when the handshake fails.
            sock.close()
            raise
        if self.cert_reqs == ssl.CERT_REQUIRED:
            cert = self.sock.getpeercert()
            # NOTE(review): HTTPConnection is expected to have split any
            # ":port" suffix off into self.port already, so the host is used
            # as-is (the previous split(':', 0) was a no-op).
            hostname = self.host
            if not self._ValidateCertificateHostname(cert, hostname):
                # Drop the connection before reporting the mismatch.
                self.close()
                raise InvalidCertificateException(hostname, cert,
                                                  'hostname mismatch')
|
||||||
|
|
||||||
|
|
||||||
|
class VerifiedHTTPSHandler(urllib2.HTTPSHandler):
    """urllib2 HTTPS handler that opens certificate-validating connections.

    Keyword arguments given to the constructor (e.g. ``ca_certs``) are
    forwarded to every ``CertValidatingHTTPSConnection`` it creates.
    """

    def __init__(self, **kwargs):
        # Only the AbstractHTTPHandler initialiser is run here, not
        # HTTPSHandler's own; the connection options are kept for https_open.
        urllib2.AbstractHTTPHandler.__init__(self)
        self._connection_args = kwargs

    def https_open(self, req):
        """Open ``req`` over a validating HTTPS connection.

        Raises:
            InvalidCertificateException: the request failed with an
                ``ssl.SSLError`` whose code is ``ssl.SSL_ERROR_SSL``.
            urllib2.URLError: any other failure, re-raised unchanged.
        """
        def http_class_wrapper(host, **kwargs):
            # Per-request arguments override the handler-wide defaults.
            full_kwargs = dict(self._connection_args)
            full_kwargs.update(kwargs)
            return CertValidatingHTTPSConnection(host, **full_kwargs)

        try:
            return self.do_open(http_class_wrapper, req)
        except urllib2.URLError as e:
            reason = e.reason
            # isinstance also covers SSLError subclasses; the slice keeps
            # the check safe when args is empty.
            if (isinstance(reason, ssl.SSLError)
                    and reason.args[:1] == (ssl.SSL_ERROR_SSL,)):
                if len(reason.args) > 1:
                    detail = reason.args[1]
                else:
                    detail = str(reason)
                raise InvalidCertificateException(req.host, '', detail)
            raise

    https_request = urllib2.HTTPSHandler.do_request_
|
||||||
|
|
||||||
|
# end of copy-paste code
|
||||||
|
|
||||||
|
|
||||||
class GZIPHandler(urllib2.BaseHandler):
|
class GZIPHandler(urllib2.BaseHandler):
|
||||||
def http_request(self, req):
|
def http_request(self, req):
|
||||||
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
req.add_unredirected_header('Accept-Encoding', 'gzip')
|
||||||
|
|
|
@ -39,6 +39,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
|
||||||
|
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
|
|
||||||
|
CA_CERT = 'cacert.pem'  # CA bundle file, passed as ca_certs to crawler.VerifiedHTTPSHandler (relative path; presumably resolved against the working directory -- verify)
|
||||||
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
|
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
|
||||||
|
|
||||||
MIMETYPE = {
|
MIMETYPE = {
|
||||||
|
@ -211,7 +212,8 @@ class Cache:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
|
default_handlers = [crawler.VerifiedHTTPSHandler(ca_certs=CA_CERT),
|
||||||
|
crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
|
||||||
crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
|
crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
|
||||||
crawler.EncodingFixHandler()]
|
crawler.EncodingFixHandler()]
|
||||||
|
|
||||||
|
@ -432,6 +434,8 @@ def Fetch(url, cache, options):
|
||||||
xml = con.read()
|
xml = con.read()
|
||||||
except (urllib2.HTTPError) as e:
|
except (urllib2.HTTPError) as e:
|
||||||
raise MorssException('Error downloading feed (HTTP Error %s)' % e.code)
|
raise MorssException('Error downloading feed (HTTP Error %s)' % e.code)
|
||||||
|
except (crawler.InvalidCertificateException) as e:
|
||||||
|
raise MorssException('Error downloading feed (Invalid SSL Certificate)')
|
||||||
except (IOError, httplib.HTTPException):
|
except (IOError, httplib.HTTPException):
|
||||||
raise MorssException('Error downloading feed')
|
raise MorssException('Error downloading feed')
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue