diff --git a/morss/crawler.py b/morss/crawler.py
index af32d69..57ff633 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -14,6 +14,104 @@ MIMETYPE = {
     'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
 
 
+# SSL-related code proudly copy-pasted from https://stackoverflow.com/questions/1087227/validate-ssl-certificates-with-python
+
+class InvalidCertificateException(httplib.HTTPException, urllib2.URLError):
+    """Raised when a host presents a certificate that fails validation."""
+
+    def __init__(self, host, cert, reason):
+        httplib.HTTPException.__init__(self)
+        self.host = host
+        self.cert = cert
+        self.reason = reason
+
+    def __str__(self):
+        return ('Host %s returned an invalid certificate (%s) %s\n' %
+                (self.host, self.reason, self.cert))
+
+
+class CertValidatingHTTPSConnection(httplib.HTTPConnection):
+    """HTTPS connection that checks the peer certificate against ca_certs.
+
+    The certificate is only required and its hostname only checked when
+    ca_certs is given; without it no certificate check is performed.
+    """
+
+    default_port = httplib.HTTPS_PORT
+
+    def __init__(self, host, port=None, key_file=None, cert_file=None,
+                 ca_certs=None, strict=None, **kwargs):
+        httplib.HTTPConnection.__init__(self, host, port, strict, **kwargs)
+        self.key_file = key_file
+        self.cert_file = cert_file
+        self.ca_certs = ca_certs
+        if self.ca_certs:
+            self.cert_reqs = ssl.CERT_REQUIRED
+        else:
+            self.cert_reqs = ssl.CERT_NONE
+
+    def _GetValidHostsForCert(self, cert):
+        """Return the DNS names (or, lacking SANs, the CNs) the cert covers."""
+        if 'subjectAltName' in cert:
+            return [x[1] for x in cert['subjectAltName']
+                    if x[0].lower() == 'dns']
+        else:
+            return [x[0][1] for x in cert['subject']
+                    if x[0][0].lower() == 'commonname']
+
+    def _ValidateCertificateHostname(self, cert, hostname):
+        """Return True if hostname matches one of the names in cert."""
+        hosts = self._GetValidHostsForCert(cert)
+        for host in hosts:
+            # escape every regex metacharacter; '*' stays within one label
+            host_re = re.escape(host).replace('\\*', '[^.]*')
+            if re.search('^%s$' % (host_re,), hostname, re.I):
+                return True
+        return False
+
+    def connect(self):
+        """Open the SSL socket and verify the certificate's hostname."""
+        # pass the connection timeout on to the socket
+        sock = socket.create_connection((self.host, self.port), self.timeout)
+        self.sock = ssl.wrap_socket(sock, keyfile=self.key_file,
+                                    certfile=self.cert_file,
+                                    cert_reqs=self.cert_reqs,
+                                    ca_certs=self.ca_certs)
+        if self.cert_reqs & ssl.CERT_REQUIRED:
+            cert = self.sock.getpeercert()
+            hostname = self.host.split(':', 0)[0]
+            if not self._ValidateCertificateHostname(cert, hostname):
+                raise InvalidCertificateException(hostname, cert,
+                                                  'hostname mismatch')
+
+
+class VerifiedHTTPSHandler(urllib2.HTTPSHandler):
+    """urllib2 HTTPS handler built on CertValidatingHTTPSConnection."""
+
+    def __init__(self, **kwargs):
+        urllib2.AbstractHTTPHandler.__init__(self)
+        self._connection_args = kwargs
+
+    def https_open(self, req):
+        def http_class_wrapper(host, **kwargs):
+            full_kwargs = dict(self._connection_args)
+            full_kwargs.update(kwargs)
+            return CertValidatingHTTPSConnection(host, **full_kwargs)
+
+        try:
+            return self.do_open(http_class_wrapper, req)
+        except urllib2.URLError as e:
+            # an SSLError with code 1 is reported as an invalid certificate
+            if isinstance(e.reason, ssl.SSLError) and e.reason.args[0] == 1:
+                raise InvalidCertificateException(req.host, '',
+                                                  e.reason.args[1])
+            raise
+
+    https_request = urllib2.HTTPSHandler.do_request_
+
+# end of copy-paste code
+
+
 class GZIPHandler(urllib2.BaseHandler):
     def http_request(self, req):
         req.add_unredirected_header('Accept-Encoding', 'gzip')
diff --git a/morss/morss.py b/morss/morss.py
index 9c89391..74a5d83 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -39,6 +39,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
 
 DEBUG = False
 
+CA_CERT = 'cacert.pem' # ca cert file
 
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 MIMETYPE = {
@@ -211,7 +212,8 @@ class Cache:
         return self
 
 
-default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
+default_handlers = [crawler.VerifiedHTTPSHandler(ca_certs=CA_CERT),
+                    crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
                     crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
                     crawler.EncodingFixHandler()]
 
@@ -432,6 +434,8 @@ def Fetch(url, cache, options):
         xml = con.read()
     except (urllib2.HTTPError) as e:
         raise MorssException('Error downloading feed (HTTP Error %s)' % e.code)
+    except (crawler.InvalidCertificateException) as e:
+        raise MorssException('Error downloading feed (Invalid SSL Certificate)')
     except (IOError, httplib.HTTPException):
         raise MorssException('Error downloading feed')
 