2 changed files with 89 additions and 82 deletions
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -23,6 +23,7 @@ from io import BytesIO, StringIO
 import re
 import chardet
 from cgi import parse_header
+import lxml.html
 import time
 import threading
 import random
@@ -104,7 +105,7 @@ def adv_get(url, timeout=None, *args, **kwargs):
    }


-def custom_handler(follow=None, delay=None):
+def custom_handler(follow=None, delay=None, encoding=None):
    handlers = []

    # as per urllib2 source code, these Handelers are added first
@@ -123,7 +124,7 @@ def custom_handler(follow=None, delay=None):
    handlers.append(HTTPRefreshHandler())
    handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
    handlers.append(BrowserlyHeaderHandler())
-    handlers.append(EncodingFixHandler())
+    handlers.append(EncodingFixHandler(encoding))

    if follow:
        handlers.append(AlternateHandler(MIMETYPE[follow]))
@@ -175,51 +176,6 @@ def sanitize_url(url):
    return urlunparse(parts)


-class RespDataHandler(BaseHandler):
-    " Make it easier to use the reponse body "
-
-    def data_reponse(self, req, resp, data):
-        pass
-
-    def http_response(self, req, resp):
-        # read data
-        data = resp.read()
-
-        # process data and use returned content (if any)
-        data = self.data_response(req, resp, data) or data
-
-        # reformat the stuff
-        fp = BytesIO(data)
-        old_resp = resp
-        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-        resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
-
-
-class RespStrHandler(RespDataHandler):
-    " Make it easier to use the _decoded_ reponse body "
-
-    def str_reponse(self, req, resp, data_str):
-        pass
-
-    def data_response(self, req, resp, data):
-        #decode
-        enc = detect_encoding(data, resp)
-        data_str = data.decode(enc, 'replace')
-
-        #process
-        data_str = self.str_response(req, resp, data_str)
-
-        # return
-        data = data_str.encode(enc) if data_str is not None else data
-
-        #return
-        return data
-
-
 class DebugHandler(BaseHandler):
    handler_order = 2000

@@ -240,7 +196,7 @@ class SizeLimitHandler(BaseHandler):

    handler_order = 450

-    def __init__(self, limit=5*1024**2):
+    def __init__(self, limit=5*1024**2):  # default limit: 5 MiB
        self.limit = limit

    def http_response(self, req, resp):
@@ -261,17 +217,29 @@ def UnGzip(data):
    return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)


-class GZIPHandler(RespDataHandler):
+class GZIPHandler(BaseHandler):
    def http_request(self, req):
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

-    def data_response(self, req, resp, data):
+    def http_response(self, req, resp):
        if 200 <= resp.code < 300:
            if resp.headers.get('Content-Encoding') == 'gzip':
+                data = resp.read()
+
+                data = UnGzip(data)
+
                resp.headers['Content-Encoding'] = 'identity'

-                return UnGzip(data)
+                fp = BytesIO(data)
+                old_resp = resp
+                resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+                resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+    https_request = http_request


 def detect_encoding(data, resp=None):
@@ -308,9 +276,28 @@ def detect_raw_encoding(data, resp=None):
    return 'utf-8'


-class EncodingFixHandler(RespStrHandler):
-    def str_response(self, req, resp, data_str):
-        return data_str
+class EncodingFixHandler(BaseHandler):
+    " Re-encode 2xx text bodies so they are valid in the forced (or detected) encoding "
+
+    def __init__(self, encoding=None):
+        self.encoding = encoding  # forced charset; falsy means detect it per response
+
+    def http_response(self, req, resp):
+        maintype = resp.info().get('Content-Type', '').split('/')[0]
+        if 200 <= resp.code < 300 and maintype == 'text':
+            data = resp.read()
+            enc = self.encoding or detect_encoding(data, resp)
+            # 'replace' both ways: the U+FFFD produced by decoding cannot be encoded by every charset
+            data = data.decode(enc, 'replace').encode(enc, 'replace')
+
+            fp = BytesIO(data)
+            old_resp = resp
+            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response


 class UAHandler(BaseHandler):
@@ -336,51 +323,71 @@ class BrowserlyHeaderHandler(BaseHandler):
    https_request = http_request


-def iter_html_tag(html_str, tag_name):
-    re_tag = r'<%s(\s*[^>])*>' % tag_name
-    re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
-
-    for tag_match in re.finditer(re_tag, html_str):
-        attr_match = re.findall(re_attr, tag_match.group(0))
-
-        if attr_match is not None:
-            yield dict(attr_match)
-
-
-class AlternateHandler(RespStrHandler):
+class AlternateHandler(BaseHandler):
    " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "

    def __init__(self, follow=None):
        self.follow = follow or []

-    def str_response(self, req, resp, data_str):
+    def http_response(self, req, resp):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]
-
        if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
            # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types

-            for link in iter_html_tag(data_str[:10000], 'link'):
-                if (link.get('rel') == 'alternate'
-                        and link.get('type') in self.follow
-                        and 'href' in link):
+            data = resp.read()
+
+            try:
+                links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
+
+                for link in links:
+                    if link.get('type', '') in self.follow and link.get('href'):  # need a target to redirect to
                        resp.code = 302
                        resp.msg = 'Moved Temporarily'
                        resp.headers['location'] = link.get('href')
                        break

+            except (ValueError, SyntaxError, lxml.etree.ParserError):  # ParserError: empty document
+                # catch parsing errors
+                pass

-class HTTPEquivHandler(RespStrHandler):
+            fp = BytesIO(data)
+            old_resp = resp
+            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+
+
+class HTTPEquivHandler(BaseHandler):
    " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "

    handler_order = 600

-    def str_response(self, req, resp, data_str):
+    def http_response(self, req, resp):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]
        if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
+            data = resp.read()

-            for meta in iter_html_tag(data_str[:10000], 'meta'):
-                if 'http-equiv' in meta and 'content' in meta:
-                    resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
+            try:
+                headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv][@content]')
+
+                for header in headers:
+                    resp.headers[header.get('http-equiv').lower()] = header.get('content')
+
+            except (ValueError, SyntaxError, lxml.etree.ParserError):  # ParserError: empty document
+                # catch parsing errors
+                pass
+
+            fp = BytesIO(data)
+            old_resp = resp
+            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response


 class HTTPRefreshHandler(BaseHandler):
--- a/morss/wsgi.py
+++ b/morss/wsgi.py
@@ -257,7 +257,7 @@ def cgi_error_handler(environ, start_response, app):
    except Exception as e:
        headers = {'status': '500 Oops', 'content-type': 'text/html'}
        start_response(headers['status'], list(headers.items()), sys.exc_info())
-        log('ERROR: %s' % repr(e))
+        log('ERROR: %s' % repr(e), force=True)
        return [cgitb.html(sys.exc_info())]