diff --git a/morss/crawler.py b/morss/crawler.py index 621ac6f..fe0ec67 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -5,13 +5,9 @@ from gzip import GzipFile from io import BytesIO try: - from urllib2 import URLError - from urllib2 import HTTPSHandler, BaseHandler, AbstractHTTPHandler, Request, addinfourl - from httplib import HTTPException, HTTPConnection, HTTPS_PORT + from urllib2 import BaseHandler, addinfourl, parse_keqv_list, parse_http_list except ImportError: - from urllib.error import URLError - from urllib.request import HTTPSHandler, BaseHandler, AbstractHTTPHandler, Request, addinfourl - from http.client import HTTPException, HTTPConnection, HTTPS_PORT + from urllib.request import BaseHandler, addinfourl, parse_keqv_list, parse_http_list import re @@ -25,89 +21,6 @@ MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'], 'html': ['text/html', 'application/xhtml+xml', 'application/xml']} - -# SSL-related code proudly copy-pasted from https://stackoverflow.com/questions/1087227/validate-ssl-certificates-with-python - -class InvalidCertificateException(HTTPException, URLError): - def __init__(self, host, cert, reason): - HTTPException.__init__(self) - self.host = host - self.cert = cert - self.reason = reason - - def __str__(self): - return ('Host %s returned an invalid certificate (%s) %s\n' % - (self.host, self.reason, self.cert)) - - -class CertValidatingHTTPSConnection(HTTPConnection): - default_port = HTTPS_PORT - - def __init__(self, host, port=None, key_file=None, cert_file=None, - ca_certs=None, strict=None, **kwargs): - HTTPConnection.__init__(self, host, port, strict, **kwargs) - self.key_file = key_file - self.cert_file = cert_file - self.ca_certs = ca_certs - if self.ca_certs: - self.cert_reqs = ssl.CERT_REQUIRED - else: - self.cert_reqs = ssl.CERT_NONE - - def _GetValidHostsForCert(self, cert): - if 'subjectAltName' in cert: - return [x[1] for x in cert['subjectAltName'] - if x[0].lower() == 'dns'] - else: - return [x[0][1] for x in cert['subject'] - if x[0][0].lower() == 'commonname'] - - def _ValidateCertificateHostname(self, cert, hostname): - hosts = self._GetValidHostsForCert(cert) - for host in hosts: - host_re = host.replace('.', '\.').replace('*', '[^.]*') - if re.search('^%s$' % (host_re,), hostname, re.I): - return True - return False - - def connect(self): - sock = socket.create_connection((self.host, self.port)) - self.sock = ssl.wrap_socket(sock, keyfile=self.key_file, - certfile=self.cert_file, - cert_reqs=self.cert_reqs, - ca_certs=self.ca_certs) - if self.cert_reqs & ssl.CERT_REQUIRED: - cert = self.sock.getpeercert() - hostname = self.host.split(':', 0)[0] - if not self._ValidateCertificateHostname(cert, hostname): - raise InvalidCertificateException(hostname, cert, - 'hostname mismatch') - - -class VerifiedHTTPSHandler(HTTPSHandler): - def __init__(self, **kwargs): - AbstractHTTPHandler.__init__(self) - self._connection_args = kwargs - - def https_open(self, req): - def http_class_wrapper(host, **kwargs): - full_kwargs = dict(self._connection_args) - full_kwargs.update(kwargs) - return CertValidatingHTTPSConnection(host, **full_kwargs) - - try: - return self.do_open(http_class_wrapper, req) - except URLError as e: - if type(e.reason) == ssl.SSLError and e.reason.args[0] == 1: - raise InvalidCertificateException(req.host, '', - e.reason.args[1]) - raise - - https_request = HTTPSHandler.do_request_ - -# end of copy-paste code - - class GZIPHandler(BaseHandler): def http_request(self, req): req.add_unredirected_header('Accept-Encoding', 'gzip') diff --git a/morss/morss.py b/morss/morss.py index 371626a..00682b0 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -232,8 +232,7 @@ class Cache: return self -default_handlers = [crawler.VerifiedHTTPSHandler(ca_certs=CA_CERT), - crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA), +default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA), crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(), crawler.EncodingFixHandler()]