From 1b26c5f0e33b7d6fc064e2df31f9f04fe49df7bf Mon Sep 17 00:00:00 2001
From: pictuga
Date: Wed, 19 Nov 2014 11:57:40 +0100
Subject: [PATCH] Split SimpleDownload into separate handlers

Cleaner code, easier to edit, more flexible. Paves the way for SSL
certificate validation. The code of ContentNegociationHandler still
needs to be cleaned up.
---
 morss/crawler.py | 182 +++++++++++++++++++++++++++++++++++++++++++++++
 morss/morss.py | 154 +++++----------------------------------
 2 files changed, 199 insertions(+), 137 deletions(-)
 create mode 100644 morss/crawler.py

diff --git a/morss/crawler.py b/morss/crawler.py
new file mode 100644
index 0000000..af32d69
--- /dev/null
+++ b/morss/crawler.py
@@ -0,0 +1,182 @@
+import urllib2
+import httplib
+import ssl
+import socket
+
+from gzip import GzipFile
+from StringIO import StringIO
+
+import re
+
+
+MIMETYPE = {
+    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
+    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
+
+
+class GZIPHandler(urllib2.BaseHandler):
+    def http_request(self, req):
+        req.add_unredirected_header('Accept-Encoding', 'gzip')
+        return req
+
+    def http_response(self, req, resp):
+        if 200 <= resp.code < 300:
+            if resp.headers.get('Content-Encoding') == 'gzip':
+                data = resp.read()
+                data = GzipFile(fileobj=StringIO(data), mode='r').read()
+
+                fp = StringIO(data)
+                old_resp = resp
+                resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+                resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+    https_request = http_request
+
+
+def detect_encoding(data, con=None):
+    if con is not None and con.headers.getparam('charset'):
+        return con.headers.getparam('charset')
+
+    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
+    if match:
+        return match.groups()[0]
+
+    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
+    if match:
+        return match.groups()[0].lower()
+
+    return None
+
+
+class EncodingFixHandler(urllib2.BaseHandler):
+    def http_response(self, req, resp):
+        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
+            data = resp.read()
+            enc = detect_encoding(data, resp)
+
+            if enc:
+                data = data.decode(enc, 'replace')
+                data = data.encode(enc)
+
+            fp = StringIO(data)
+            old_resp = resp
+            resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+
+
+class UAHandler(urllib2.BaseHandler):
+    def __init__(self, useragent=None):
+        self.useragent = useragent
+
+    def http_request(self, req):
+        if self.useragent:
+            req.add_unredirected_header('User-Agent', self.useragent)
+        return req
+
+    https_request = http_request
+
+
+class AutoRefererHandler(urllib2.BaseHandler):
+    def http_request(self, req):
+        if req.get_host() != 'feeds.feedburner.com':
+            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
+        return req
+
+    https_request = http_request
+
+
+class ContentNegociationHandler(urllib2.BaseHandler): #FIXME
+    def __init__(self, accept=None, strict=False):
+        self.accept = accept
+        self.strict = strict
+
+    def http_request(self, req):
+        if self.accept is not None:
+            if isinstance(self.accept, basestring):
+                self.accept = (self.accept,)
+
+            out = {}
+            rank = 1.1
+            for group in self.accept:
+                rank -= 0.1
+
+                if isinstance(group, basestring):
+                    if group in MIMETYPE:
+                        group = MIMETYPE[group]
+                    else:
+                        out[group] = rank
+                        continue
+
+                for mime in group:
+                    if mime not in out:
+                        out[mime] = rank
+
+            if not self.strict:
+                out['*/*'] = rank - 0.1
+
+            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
+            req.add_unredirected_header('Accept', string)
+
+        return req
+
+    https_request = http_request
+
+
+class MetaRedirectHandler(urllib2.BaseHandler):
+    def http_response(self, req, resp):
+        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
+            if resp.info().type in MIMETYPE['html']:
+                data = resp.read()
+                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
+                if match:
+                    new_url = match.groups()[0]
+                    new_headers = dict((k, v) for k, v in req.headers.items()
+                                       if k.lower() not in ('content-length', 'content-type'))
+                    new = urllib2.Request(new_url,
+                                          headers=new_headers,
+                                          origin_req_host=req.get_origin_req_host(),
+                                          unverifiable=True)
+
+                    return self.parent.open(new, timeout=req.timeout)
+                else:
+                    fp = StringIO(data)
+                    old_resp = resp
+                    resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+                    resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+
+
+class EtagHandler(urllib2.BaseHandler):
+    def __init__(self, cache="", etag=None, lastmodified=None):
+        self.cache = cache
+        self.etag = etag
+        self.lastmodified = lastmodified
+
+    def http_request(self, req):
+        if self.cache:
+            if self.etag:
+                req.add_unredirected_header('If-None-Match', self.etag)
+            if self.lastmodified:
+                req.add_unredirected_header('If-Modified-Since', self.lastmodified)
+
+        return req
+
+    def http_error_304(self, req, fp, code, msg, headers):
+        if self.etag:
+            headers.addheader('etag', self.etag)
+        if self.lastmodified:
+            headers.addheader('last-modified', self.lastmodified)
+        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
+        return resp
+
+    https_request = http_request
diff --git a/morss/morss.py b/morss/morss.py
index f108a98..9c89391 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -16,6 +16,7 @@ import lxml.html
 
 import feeds
 import feedify
+import crawler
 
 import httplib
 import urllib
@@ -25,9 +26,6 @@ import urlparse
 
 import wsgiref.simple_server
 import wsgiref.handlers
 
-from gzip import GzipFile
-from StringIO import StringIO
-
 from readability import readability
 from html2text import HTML2Text
 
@@ -41,8 +39,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
 
 DEBUG = False
 
-UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
-UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
+DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 
 MIMETYPE = {
     'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
@@ -214,136 +211,20 @@ class Cache:
         return self
 
-class SimpleDownload(urllib2.HTTPCookieProcessor):
-    """
-    Custom urllib2 handler to download a page, using etag/last-modified headers,
-    to save bandwidth. The given headers are added back into the header on error
-    304 for easier use.
-    """
+default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
+                    crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
+                    crawler.EncodingFixHandler()]
 
-    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None,
-                 accept=None, strict=False):
-        urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
-        self.cache = cache
-        self.etag = etag
-        self.lastmodified = lastmodified
-        self.useragent = useragent
-        self.decode = decode
-        self.accept = accept
-        self.strict = strict
 
+def accept_handler(*kargs):
+    handlers = default_handlers[:]
+    handlers.append(crawler.ContentNegociationHandler(*kargs))
+    return handlers
 
-    def http_request(self, req):
-        urllib2.HTTPCookieProcessor.http_request(self, req)
-        req.add_unredirected_header('Accept-Encoding', 'gzip')
-        req.add_unredirected_header('User-Agent', self.useragent)
-        if req.get_host() != 'feeds.feedburner.com':
-            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
-
-        if self.cache:
-            if self.etag:
-                req.add_unredirected_header('If-None-Match', self.etag)
-            if self.lastmodified:
-                req.add_unredirected_header('If-Modified-Since', self.lastmodified)
-
-        if self.accept is not None:
-            if isinstance(self.accept, basestring):
-                self.accept = (self.accept,)
-
-            out = {}
-            rank = 1.1
-            for group in self.accept:
-                rank -= 0.1
-
-                if isinstance(group, basestring):
-                    if group in MIMETYPE:
-                        group = MIMETYPE[group]
-                    else:
-                        out[group] = rank
-                        continue
-
-                for mime in group:
-                    if mime not in out:
-                        out[mime] = rank
-
-            if not self.strict:
-                out['*/*'] = rank - 0.1
-
-            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
-            req.add_unredirected_header('Accept', string)
-
-        return req
-
-    def http_error_304(self, req, fp, code, msg, headers):
-        log('http cached')
-        if self.etag:
-            headers.addheader('etag', self.etag)
-        if self.lastmodified:
-            headers.addheader('last-modified', self.lastmodified)
-        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
-        return resp
-
-    def http_response(self, req, resp):
-        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
-        data = resp.read()
-
-        if 200 <= resp.code < 300:
-            # gzip
-            if resp.headers.get('Content-Encoding') == 'gzip':
-                log('un-gzip')
-                data = GzipFile(fileobj=StringIO(data), mode='r').read()
-
-        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
-            # redirect
-            if resp.info().type in MIMETYPE['html']:
-                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
-                if match:
-                    new_url = match.groups()[0]
-                    log('redirect: %s' % new_url)
-
-                    new_headers = dict((k, v) for k, v in req.headers.items()
-                                       if k.lower() not in ('content-length', 'content-type'))
-                    new = urllib2.Request(new_url,
-                                          headers=new_headers,
-                                          origin_req_host=req.get_origin_req_host(),
-                                          unverifiable=True)
-
-                    return self.parent.open(new, timeout=req.timeout)
-
-            # encoding
-            enc = detect_encoding(data, resp)
-
-            if enc:
-                data = data.decode(enc, 'replace')
-
-                if not self.decode:
-                    data = data.encode(enc)
-
-        fp = StringIO(data)
-        old_resp = resp
-        resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-        resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
-    https_request = http_request
-
-
-def detect_encoding(data, con=None):
-    if con is not None and con.headers.getparam('charset'):
-        log('header')
-        return con.headers.getparam('charset')
-
-    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
-    if match:
-        log('meta.re')
-        return match.groups()[0]
-
-    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
-    if match:
-        return match.groups()[0].lower()
-
-    return None
 
+def etag_handler(accept, strict, cache, etag, lastmodified):
+    handlers = default_handlers[:]
+    handlers.append(crawler.ContentNegociationHandler(accept, strict))
+    handlers.append(crawler.EtagHandler(cache, etag, lastmodified))
+    return handlers
 
 
 def Fix(item, feedurl='/'):
@@ -485,7 +366,7 @@ def Fill(item, cache, options, feedurl='/', fast=False):
     # download
     try:
         url = link.encode('utf-8')
-        con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
+        con = urllib2.build_opener(*accept_handler(('html', 'text/*'), True)).open(url, timeout=TIMEOUT)
         data = con.read()
     except (IOError, httplib.HTTPException) as e:
         log('http error: %s' % e.message)
@@ -546,9 +427,8 @@ def Fetch(url, cache, options):
         style = cache.get('style')
     else:
         try:
-            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'),
-                                    accept=('xml', 'html'))
-            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT * 2)
+            opener = etag_handler(('xml', 'html'), False, cache.get(url), cache.get('etag'), cache.get('lastmodified'))
+            con = urllib2.build_opener(*opener).open(url, timeout=TIMEOUT * 2)
             xml = con.read()
         except (urllib2.HTTPError) as e:
             raise MorssException('Error downloading feed (HTTP Error %s)' % e.code)
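
For reviewers, here is how the split-out handlers compose. This is a usage
sketch, not part of the patch; the feed URL and user-agent string are
placeholders. It builds by hand the same stack that accept_handler() now
assembles in morss.py:

import urllib2

import crawler

# Transport-level handlers first, then content negotiation: gzip,
# User-Agent, per-host Referer, <meta http-equiv="refresh"> redirects,
# charset fix-up, and finally the Accept header (feeds before plain HTML).
handlers = [crawler.GZIPHandler(),
            crawler.UAHandler('Mozilla/5.0 (X11; Linux x86_64)'),  # any UA string
            crawler.AutoRefererHandler(),
            crawler.MetaRedirectHandler(),
            crawler.EncodingFixHandler(),
            crawler.ContentNegociationHandler(('xml', 'html'), strict=False)]

# build_opener() takes each handler as a separate argument, hence the *
# at the morss.py call sites above.
con = urllib2.build_opener(*handlers).open('http://example.com/feed.xml', timeout=8)
data = con.read()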
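
The Accept header that ContentNegociationHandler produces can be checked
without any network traffic. For ('xml', 'html') with strict=False, the feed
types rank at q=1 (no parameter emitted), HTML at q=0.9, with a */* fallback
at q=0.8; entry order may vary since the header is built from a dict:

import urllib2

import crawler

req = urllib2.Request('http://example.com/')
req = crawler.ContentNegociationHandler(('xml', 'html')).http_request(req)

# Prints something like (order not guaranteed):
# text/xml,application/xml,application/rss+xml,application/rdf+xml,
#     application/atom+xml,text/html;q=0.9,application/xhtml+xml;q=0.9,*/*;q=0.8
print req.get_header('Accept')

Note that application/xml appears in both MIMETYPE groups but keeps q=1 from
the first group, because http_request() only records a MIME type the first
time it is seen.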
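
Finally, the cache round trip: EtagHandler sends If-None-Match and
If-Modified-Since, and on a 304 its http_error_304() replays the cached body
as a fresh 200 response with the stored validators added back to the headers,
so callers never have to special-case revalidation. A sketch, with made-up
cache contents:

import urllib2

import crawler

cached_body = '<rss>...</rss>'  # body saved from an earlier fetch (placeholder)
cached_etag = '"abc123"'        # ETag saved alongside it (placeholder)

handlers = [crawler.GZIPHandler(),
            crawler.EtagHandler(cached_body, etag=cached_etag)]

# If the server answers 304 Not Modified, read() returns cached_body and
# con.code is 200; a real 200 passes through untouched.
con = urllib2.build_opener(*handlers).open('http://example.com/feed.xml')
data = con.read()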