Split SimpleDownload into several Handlers
Cleaner code, easier to edit, more flexibility. Paves the way for SSL certificate validation. Still have to clean up the code of AcceptHeadersHandler.

Branch: master
Parent: f46576168a
Commit: 1b26c5f0e3
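
For a sense of how the pieces recombine, here is a minimal sketch of composing the new handlers into an opener (the handler names come from this commit; the URL and User-Agent string are placeholders):

    import urllib2
    import crawler

    # each handler does one job; urllib2 runs their http_request /
    # http_response hooks in sequence when the opener fetches a URL
    opener = urllib2.build_opener(
        crawler.GZIPHandler(),
        crawler.UAHandler('Mozilla/5.0'),  # placeholder User-Agent
        crawler.AutoRefererHandler(),
        crawler.ContentNegociationHandler(('html', 'text/*'), strict=True))

    resp = opener.open('http://example.com/page', timeout=10)  # placeholder URL
    data = resp.read()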
New file (imported below as crawler):

@@ -0,0 +1,182 @@
import urllib2
import httplib
import ssl
import socket

# ssl and socket are not used yet; presumably groundwork for the SSL
# certificate validation mentioned in the commit message
from gzip import GzipFile
from StringIO import StringIO

import re


MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}

class GZIPHandler(urllib2.BaseHandler):
    def http_request(self, req):
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

    def http_response(self, req, resp):
        if 200 <= resp.code < 300:
            if resp.headers.get('Content-Encoding') == 'gzip':
                data = resp.read()
                data = GzipFile(fileobj=StringIO(data), mode='r').read()

                # rebuild the response around the decompressed body
                fp = StringIO(data)
                old_resp = resp
                resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
                resp.msg = old_resp.msg

        return resp

    https_response = http_response
    https_request = http_request

def detect_encoding(data, con=None):
    # 1) trust the charset from the HTTP headers when present
    if con is not None and con.headers.getparam('charset'):
        return con.headers.getparam('charset')

    # 2) look for a <meta> charset near the top of the document
    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        return match.groups()[0]

    # 3) fall back to an XML encoding declaration
    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
    if match:
        return match.groups()[0].lower()

    return None

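A quick illustration of the fallback order, with a made-up snippet:

    >>> detect_encoding('<?xml version="1.0" encoding="UTF-8"?><rss></rss>')
    'utf-8'

There is no connection object and no charset= match, so the encoding= declaration is found and lowercased; a charset from the HTTP headers or a <meta> tag would win first (and is returned as-is, without lowercasing).
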
class EncodingFixHandler(urllib2.BaseHandler):
    def http_response(self, req, resp):
        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            data = resp.read()
            enc = detect_encoding(data, resp)

            if enc:
                # decode with replacement of invalid bytes, then re-encode,
                # so downstream consumers get well-formed text
                data = data.decode(enc, 'replace')
                data = data.encode(enc)

            fp = StringIO(data)
            old_resp = resp
            resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        return resp

    https_response = http_response

class UAHandler(urllib2.BaseHandler):
    def __init__(self, useragent=None):
        self.useragent = useragent

    def http_request(self, req):
        if self.useragent:
            req.add_unredirected_header('User-Agent', self.useragent)
        return req

    https_request = http_request

class AutoRefererHandler(urllib2.BaseHandler):
    def http_request(self, req):
        # send the target host as its own Referer (feedburner is skipped)
        if req.get_host() != 'feeds.feedburner.com':
            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
        return req

    https_request = http_request

class ContentNegociationHandler(urllib2.BaseHandler): #FIXME
    def __init__(self, accept=None, strict=False):
        self.accept = accept
        self.strict = strict

    def http_request(self, req):
        if self.accept is not None:
            if isinstance(self.accept, basestring):
                self.accept = (self.accept,)

            # rank each group of mimetypes: first group gets q=1.0,
            # the next 0.9, then 0.8, and so on
            out = {}
            rank = 1.1
            for group in self.accept:
                rank -= 0.1

                if isinstance(group, basestring):
                    if group in MIMETYPE:
                        group = MIMETYPE[group]
                    else:
                        out[group] = rank
                        continue

                for mime in group:
                    if mime not in out:
                        out[mime] = rank

            if not self.strict:
                out['*/*'] = rank - 0.1

            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
            req.add_unredirected_header('Accept', string)

        return req

    https_request = http_request

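Under the ranking above, the accept_handler(('html', 'text/*'), True) call that morss.py makes below would emit something along the lines of (Python 2 dict ordering aside):

    Accept: text/html,application/xhtml+xml,application/xml,text/*;q=0.9

q=1 entries are sent bare, and strict=True suppresses the */* fallback.
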
class MetaRedirectHandler(urllib2.BaseHandler):
    def http_response(self, req, resp):
        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            if resp.info().type in MIMETYPE['html']:
                data = resp.read()
                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                if match:
                    # follow the <meta http-equiv="refresh"> redirect, stripping
                    # entity headers that no longer apply to the new request
                    new_url = match.groups()[0]
                    new_headers = dict((k, v) for k, v in req.headers.items()
                                       if k.lower() not in ('content-length', 'content-type'))
                    new = urllib2.Request(new_url,
                                          headers=new_headers,
                                          origin_req_host=req.get_origin_req_host(),
                                          unverifiable=True)

                    return self.parent.open(new, timeout=req.timeout)
                else:
                    # the body was consumed by read(); rebuild the response
                    fp = StringIO(data)
                    old_resp = resp
                    resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
                    resp.msg = old_resp.msg

        return resp

    https_response = http_response

class EtagHandler(urllib2.BaseHandler):
    def __init__(self, cache="", etag=None, lastmodified=None):
        self.cache = cache
        self.etag = etag
        self.lastmodified = lastmodified

    def http_request(self, req):
        # only send conditional headers when there is a cached copy to serve
        if self.cache:
            if self.etag:
                req.add_unredirected_header('If-None-Match', self.etag)
            if self.lastmodified:
                req.add_unredirected_header('If-Modified-Since', self.lastmodified)

        return req

    def http_error_304(self, req, fp, code, msg, headers):
        # turn "304 Not Modified" into a normal 200 response built from the
        # cache, echoing the validators back into the headers
        if self.etag:
            headers.addheader('etag', self.etag)
        if self.lastmodified:
            headers.addheader('last-modified', self.lastmodified)
        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
        return resp

    https_request = http_request
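
A sketch of the conditional-request round trip this enables (url and the cache plumbing are placeholders; morss wires this up through etag_handler in morss.py below):

    # first fetch: keep the body and the validators the server sent back
    con = urllib2.build_opener(crawler.EtagHandler()).open(url)
    body = con.read()
    etag = con.headers.getheader('etag')

    # later fetch: if the server answers 304, http_error_304 above hands
    # back a plain 200 response rebuilt from the cached body
    con = urllib2.build_opener(crawler.EtagHandler(body, etag)).open(url)
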
morss/morss.py (154 lines changed)

@@ -16,6 +16,7 @@ import lxml.html
 import feeds
 import feedify
+import crawler

 import httplib
 import urllib
@@ -25,9 +26,6 @@ import urlparse
 import wsgiref.simple_server
 import wsgiref.handlers

-from gzip import GzipFile
-from StringIO import StringIO
-
 from readability import readability
 from html2text import HTML2Text
@@ -41,8 +39,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
 DEBUG = False

-UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
-UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
+DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'

 MIMETYPE = {
 	'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
@@ -214,136 +211,20 @@ class Cache:
         return self


-class SimpleDownload(urllib2.HTTPCookieProcessor):
-    """
-    Custom urllib2 handler to download a page, using etag/last-modified headers,
-    to save bandwidth. The given headers are added back into the header on error
-    304 for easier use.
-    """
-
-    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None,
-            accept=None, strict=False):
-        urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
-        self.cache = cache
-        self.etag = etag
-        self.lastmodified = lastmodified
-        self.useragent = useragent
-        self.decode = decode
-        self.accept = accept
-        self.strict = strict
-
-    def http_request(self, req):
-        urllib2.HTTPCookieProcessor.http_request(self, req)
-        req.add_unredirected_header('Accept-Encoding', 'gzip')
-        req.add_unredirected_header('User-Agent', self.useragent)
-        if req.get_host() != 'feeds.feedburner.com':
-            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
-
-        if self.cache:
-            if self.etag:
-                req.add_unredirected_header('If-None-Match', self.etag)
-            if self.lastmodified:
-                req.add_unredirected_header('If-Modified-Since', self.lastmodified)
-
-        if self.accept is not None:
-            if isinstance(self.accept, basestring):
-                self.accept = (self.accept,)
-
-            out = {}
-            rank = 1.1
-            for group in self.accept:
-                rank -= 0.1
-
-                if isinstance(group, basestring):
-                    if group in MIMETYPE:
-                        group = MIMETYPE[group]
-                    else:
-                        out[group] = rank
-                        continue
-
-                for mime in group:
-                    if mime not in out:
-                        out[mime] = rank
-
-            if not self.strict:
-                out['*/*'] = rank - 0.1
-
-            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
-            req.add_unredirected_header('Accept', string)
-
-        return req
-
-    def http_error_304(self, req, fp, code, msg, headers):
-        log('http cached')
-        if self.etag:
-            headers.addheader('etag', self.etag)
-        if self.lastmodified:
-            headers.addheader('last-modified', self.lastmodified)
-        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
-        return resp
-
-    def http_response(self, req, resp):
-        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
-        data = resp.read()
-
-        if 200 <= resp.code < 300:
-            # gzip
-            if resp.headers.get('Content-Encoding') == 'gzip':
-                log('un-gzip')
-                data = GzipFile(fileobj=StringIO(data), mode='r').read()
-
-        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
-            # <meta> redirect
-            if resp.info().type in MIMETYPE['html']:
-                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
-                if match:
-                    new_url = match.groups()[0]
-                    log('redirect: %s' % new_url)
-
-                    new_headers = dict((k, v) for k, v in req.headers.items()
-                        if k.lower() not in ('content-length', 'content-type'))
-                    new = urllib2.Request(new_url,
-                        headers=new_headers,
-                        origin_req_host=req.get_origin_req_host(),
-                        unverifiable=True)
-
-                    return self.parent.open(new, timeout=req.timeout)
-
-            # encoding
-            enc = detect_encoding(data, resp)
-
-            if enc:
-                data = data.decode(enc, 'replace')
-
-                if not self.decode:
-                    data = data.encode(enc)
-
-        fp = StringIO(data)
-        old_resp = resp
-        resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-        resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
-    https_request = http_request
-
-
-def detect_encoding(data, con=None):
-    if con is not None and con.headers.getparam('charset'):
-        log('header')
-        return con.headers.getparam('charset')
-
-    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
-    if match:
-        log('meta.re')
-        return match.groups()[0]
-
-    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
-    if match:
-        return match.groups()[0].lower()
-
-    return None
+default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
+    crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
+    crawler.EncodingFixHandler()]
+
+
+def accept_handler(*kargs):
+    handlers = default_handlers[:]
+    handlers.append(crawler.ContentNegociationHandler(*kargs))
+    return handlers
+
+
+def etag_handler(accept, strict, cache, etag, lastmodified):
+    handlers = default_handlers[:]
+    handlers.append(crawler.ContentNegociationHandler(accept, strict))
+    handlers.append(crawler.EtagHandler(cache, etag, lastmodified))
+    return handlers


 def Fix(item, feedurl='/'):
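
(Since these helpers return plain lists of handler instances, call sites unpack them with a star, e.g. urllib2.build_opener(*accept_handler(('html', 'text/*'), True)), as the hunks below show.)
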
@@ -485,7 +366,7 @@ def Fill(item, cache, options, feedurl='/', fast=False):
     # download
     try:
         url = link.encode('utf-8')
-        con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
+        con = urllib2.build_opener(*accept_handler(('html', 'text/*'), True)).open(url, timeout=TIMEOUT)
         data = con.read()
     except (IOError, httplib.HTTPException) as e:
         log('http error: %s' % e.message)
@@ -546,9 +427,8 @@ def Fetch(url, cache, options):
         style = cache.get('style')
     else:
         try:
-            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'),
-                accept=('xml', 'html'))
-            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT * 2)
+            opener = etag_handler(('xml', 'html'), False, cache.get(url), cache.get('etag'), cache.get('lastmodified'))
+            con = urllib2.build_opener(*opener).open(url, timeout=TIMEOUT * 2)
             xml = con.read()
         except (urllib2.HTTPError) as e:
             raise MorssException('Error downloading feed (HTTP Error %s)' % e.code)