Split SimpleDownload into several handlers

Cleaner code, easier to edit, more flexible. Paves the way for SSL certificate validation.
The AcceptHeadersHandler code still needs cleaning up.
pictuga 2014-11-19 11:57:40 +01:00
parent f46576168a
commit 1b26c5f0e3
2 changed files with 199 additions and 137 deletions
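
For context, a minimal sketch of how the split-out handlers are meant to compose into a single urllib2 opener (not part of the diff; the URL and User-Agent string are placeholders, and morss.py builds the equivalent handler list through its new accept_handler() helper):

# Sketch only: chain the new crawler handlers into one opener.
import urllib2
import crawler

handlers = [crawler.GZIPHandler(),
            crawler.UAHandler('Mozilla/5.0 (X11; Linux x86_64)'),  # placeholder UA
            crawler.AutoRefererHandler(),
            crawler.MetaRedirectHandler(),
            crawler.EncodingFixHandler(),
            crawler.ContentNegociationHandler(('html', 'text/*'), strict=True)]

con = urllib2.build_opener(*handlers).open('http://example.com/article', timeout=4)
data = con.read()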

morss/crawler.py (new file, 182 additions)

@@ -0,0 +1,182 @@
import urllib2
import httplib
import ssl
import socket
from gzip import GzipFile
from StringIO import StringIO
import re

MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}

class GZIPHandler(urllib2.BaseHandler):
    # ask for gzip-compressed responses and decompress them transparently
    def http_request(self, req):
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

    def http_response(self, req, resp):
        if 200 <= resp.code < 300:
            if resp.headers.get('Content-Encoding') == 'gzip':
                data = resp.read()
                data = GzipFile(fileobj=StringIO(data), mode='r').read()

                fp = StringIO(data)
                old_resp = resp
                resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
                resp.msg = old_resp.msg

        return resp

    https_response = http_response
    https_request = http_request

def detect_encoding(data, con=None):
    # charset from the HTTP headers if available, else from the page itself
    if con is not None and con.headers.getparam('charset'):
        return con.headers.getparam('charset')

    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        return match.groups()[0]

    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
    if match:
        return match.groups()[0].lower()

    return None

class EncodingFixHandler(urllib2.BaseHandler):
    # normalise text responses: decode with the detected charset (replacing
    # invalid bytes) and re-encode, so downstream parsers get clean data
    def http_response(self, req, resp):
        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            data = resp.read()
            enc = detect_encoding(data, resp)

            if enc:
                data = data.decode(enc, 'replace')
                data = data.encode(enc)

            fp = StringIO(data)
            old_resp = resp
            resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        return resp

    https_response = http_response

class UAHandler(urllib2.BaseHandler):
    def __init__(self, useragent=None):
        self.useragent = useragent

    def http_request(self, req):
        if self.useragent:
            req.add_unredirected_header('User-Agent', self.useragent)
        return req

    https_request = http_request

class AutoRefererHandler(urllib2.BaseHandler):
    # send the target host itself as Referer (feedburner being the exception)
    def http_request(self, req):
        if req.get_host() != 'feeds.feedburner.com':
            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
        return req

    https_request = http_request

class ContentNegociationHandler(urllib2.BaseHandler): #FIXME
    # build an Accept header from MIMETYPE group names and/or raw mime types,
    # with decreasing q-values; strict=False also advertises */*
    def __init__(self, accept=None, strict=False):
        self.accept = accept
        self.strict = strict

    def http_request(self, req):
        if self.accept is not None:
            if isinstance(self.accept, basestring):
                self.accept = (self.accept,)

            out = {}
            rank = 1.1
            for group in self.accept:
                rank -= 0.1

                if isinstance(group, basestring):
                    if group in MIMETYPE:
                        group = MIMETYPE[group]
                    else:
                        out[group] = rank
                        continue

                for mime in group:
                    if mime not in out:
                        out[mime] = rank

            if not self.strict:
                out['*/*'] = rank - 0.1

            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
            req.add_unredirected_header('Accept', string)

        return req

    https_request = http_request

class MetaRedirectHandler(urllib2.BaseHandler):
    # follow <meta http-equiv="refresh"> redirects found in html pages
    def http_response(self, req, resp):
        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            if resp.info().type in MIMETYPE['html']:
                data = resp.read()
                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                if match:
                    new_url = match.groups()[0]
                    new_headers = dict((k, v) for k, v in req.headers.items()
                                       if k.lower() not in ('content-length', 'content-type'))
                    new = urllib2.Request(new_url,
                                          headers=new_headers,
                                          origin_req_host=req.get_origin_req_host(),
                                          unverifiable=True)
                    return self.parent.open(new, timeout=req.timeout)
                else:
                    fp = StringIO(data)
                    old_resp = resp
                    resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
                    resp.msg = old_resp.msg

        return resp

    https_response = http_response

class EtagHandler(urllib2.BaseHandler):
    # conditional requests: send If-None-Match/If-Modified-Since and, on a
    # 304 Not Modified, hand the cached body back as a regular 200 response
    def __init__(self, cache="", etag=None, lastmodified=None):
        self.cache = cache
        self.etag = etag
        self.lastmodified = lastmodified

    def http_request(self, req):
        if self.cache:
            if self.etag:
                req.add_unredirected_header('If-None-Match', self.etag)
            if self.lastmodified:
                req.add_unredirected_header('If-Modified-Since', self.lastmodified)

        return req

    def http_error_304(self, req, fp, code, msg, headers):
        if self.etag:
            headers.addheader('etag', self.etag)
        if self.lastmodified:
            headers.addheader('last-modified', self.lastmodified)

        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
        return resp

    https_request = http_request
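
As a quick illustration of conditional fetching with the new EtagHandler, a minimal sketch (not part of the diff; the URL, cached body and validator values are placeholders):

# Sketch only: re-fetch a feed with EtagHandler. On a 304 Not Modified,
# http_error_304() substitutes the cached body, so callers always see a 200.
import urllib2
import crawler

cached_body = '<rss version="2.0">...</rss>'  # placeholder cached copy
opener = urllib2.build_opener(
    crawler.EtagHandler(cached_body, etag='"abc123"',
                        lastmodified='Wed, 19 Nov 2014 10:00:00 GMT'))

con = opener.open('http://example.com/feed.xml', timeout=4)
data = con.read()  # cached body if the server answered 304, fresh body otherwise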

morss/morss.py (17 additions, 137 deletions)

@@ -16,6 +16,7 @@ import lxml.html
import feeds
import feedify
import crawler
import httplib
import urllib
@@ -25,9 +26,6 @@ import urlparse
import wsgiref.simple_server
import wsgiref.handlers
from gzip import GzipFile
from StringIO import StringIO
from readability import readability
from html2text import HTML2Text
@@ -41,8 +39,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
DEBUG = False
UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
@@ -214,136 +211,20 @@ class Cache:
        return self

class SimpleDownload(urllib2.HTTPCookieProcessor):
    """
    Custom urllib2 handler to download a page, using etag/last-modified headers,
    to save bandwidth. The given headers are added back into the header on error
    304 for easier use.
    """

default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
                    crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
                    crawler.EncodingFixHandler()]
def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None,
accept=None, strict=False):
urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
self.cache = cache
self.etag = etag
self.lastmodified = lastmodified
self.useragent = useragent
self.decode = decode
self.accept = accept
self.strict = strict

def accept_handler(*kargs):
    handlers = default_handlers[:]
    handlers.append(crawler.ContentNegociationHandler(*kargs))
    return handlers

    def http_request(self, req):
        urllib2.HTTPCookieProcessor.http_request(self, req)
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        req.add_unredirected_header('User-Agent', self.useragent)

        if req.get_host() != 'feeds.feedburner.com':
            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())

        if self.cache:
            if self.etag:
                req.add_unredirected_header('If-None-Match', self.etag)
            if self.lastmodified:
                req.add_unredirected_header('If-Modified-Since', self.lastmodified)

        if self.accept is not None:
            if isinstance(self.accept, basestring):
                self.accept = (self.accept,)

            out = {}
            rank = 1.1
            for group in self.accept:
                rank -= 0.1

                if isinstance(group, basestring):
                    if group in MIMETYPE:
                        group = MIMETYPE[group]
                    else:
                        out[group] = rank
                        continue

                for mime in group:
                    if mime not in out:
                        out[mime] = rank

            if not self.strict:
                out['*/*'] = rank - 0.1

            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
            req.add_unredirected_header('Accept', string)

        return req

    def http_error_304(self, req, fp, code, msg, headers):
        log('http cached')
        if self.etag:
            headers.addheader('etag', self.etag)
        if self.lastmodified:
            headers.addheader('last-modified', self.lastmodified)

        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
        return resp

    def http_response(self, req, resp):
        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
        data = resp.read()

        if 200 <= resp.code < 300:
            # gzip
            if resp.headers.get('Content-Encoding') == 'gzip':
                log('un-gzip')
                data = GzipFile(fileobj=StringIO(data), mode='r').read()

        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            # <meta> redirect
            if resp.info().type in MIMETYPE['html']:
                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                if match:
                    new_url = match.groups()[0]
                    log('redirect: %s' % new_url)

                    new_headers = dict((k, v) for k, v in req.headers.items()
                                       if k.lower() not in ('content-length', 'content-type'))
                    new = urllib2.Request(new_url,
                                          headers=new_headers,
                                          origin_req_host=req.get_origin_req_host(),
                                          unverifiable=True)
                    return self.parent.open(new, timeout=req.timeout)

            # encoding
            enc = detect_encoding(data, resp)
            if enc:
                data = data.decode(enc, 'replace')
                if not self.decode:
                    data = data.encode(enc)

        fp = StringIO(data)
        old_resp = resp
        resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg
        return resp

    https_response = http_response
    https_request = http_request

def detect_encoding(data, con=None):
    if con is not None and con.headers.getparam('charset'):
        log('header')
        return con.headers.getparam('charset')

    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        log('meta.re')
        return match.groups()[0]

    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
    if match:
        return match.groups()[0].lower()

    return None

def etag_handler(accept, strict, cache, etag, lastmodified):
    handlers = default_handlers[:]
    handlers.append(crawler.ContentNegociationHandler(accept, strict))
    handlers.append(crawler.EtagHandler(cache, etag, lastmodified))
    return handlers

def Fix(item, feedurl='/'):
@@ -485,7 +366,7 @@ def Fill(item, cache, options, feedurl='/', fast=False):

    # download
    try:
        url = link.encode('utf-8')
        con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
        con = urllib2.build_opener(*accept_handler(('html', 'text/*'), True)).open(url, timeout=TIMEOUT)
        data = con.read()
    except (IOError, httplib.HTTPException) as e:
        log('http error: %s' % e.message)
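
For reference, a hypothetical check (not part of the commit) of the Accept header that ContentNegociationHandler builds for the accept=('html', 'text/*'), strict=True arguments used in the call above:

# Sketch only: inspect the Accept header produced for ('html', 'text/*'), strict=True.
import urllib2
import crawler

req = urllib2.Request('http://example.com/article')  # placeholder URL
req = crawler.ContentNegociationHandler(('html', 'text/*'), strict=True).http_request(req)
print req.get_header('Accept')
# -> text/html,application/xhtml+xml,application/xml,text/*;q=0.9
#    (dicts are unordered in Python 2, so the entries may come out in another order)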

@@ -546,9 +427,8 @@ def Fetch(url, cache, options):
        style = cache.get('style')
    else:
        try:
            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'),
                                    accept=('xml', 'html'))
            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT * 2)
            opener = etag_handler(('xml', 'html'), False, cache.get(url), cache.get('etag'), cache.get('lastmodified'))
            con = urllib2.build_opener(*opener).open(url, timeout=TIMEOUT * 2)
            xml = con.read()
        except (urllib2.HTTPError) as e:
            raise MorssException('Error downloading feed (HTTP Error %s)' % e.code)