Split SimpleDownload into several Handlers
Cleaner code, easier to edit, more flexible. Paves the way for SSL certificate validation. Still have to clean up the code of AcceptHeadersHandler (here ContentNegociationHandler, marked FIXME).
This commit is contained in:
parent f46576168a
commit 1b26c5f0e3
morss

182 morss/crawler.py Normal file
@@ -0,0 +1,182 @@
import urllib2
import httplib
import ssl
import socket

from gzip import GzipFile
from StringIO import StringIO

import re


MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}


class GZIPHandler(urllib2.BaseHandler):
    def http_request(self, req):
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

    def http_response(self, req, resp):
        if 200 <= resp.code < 300:
            if resp.headers.get('Content-Encoding') == 'gzip':
                data = resp.read()
                data = GzipFile(fileobj=StringIO(data), mode='r').read()

                fp = StringIO(data)
                old_resp = resp
                resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
                resp.msg = old_resp.msg

        return resp

    https_response = http_response
    https_request = http_request


def detect_encoding(data, con=None):
    if con is not None and con.headers.getparam('charset'):
        return con.headers.getparam('charset')

    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        return match.groups()[0]

    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
    if match:
        return match.groups()[0].lower()

    return None


class EncodingFixHandler(urllib2.BaseHandler):
    def http_response(self, req, resp):
        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            data = resp.read()
            enc = detect_encoding(data, resp)

            if enc:
                data = data.decode(enc, 'replace')
                data = data.encode(enc)

            fp = StringIO(data)
            old_resp = resp
            resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        return resp

    https_response = http_response


class UAHandler(urllib2.BaseHandler):
    def __init__(self, useragent=None):
        self.useragent = useragent

    def http_request(self, req):
        if self.useragent:
            req.add_unredirected_header('User-Agent', self.useragent)
        return req

    https_request = http_request


class AutoRefererHandler(urllib2.BaseHandler):
    def http_request(self, req):
        if req.get_host() != 'feeds.feedburner.com':
            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
        return req

    https_request = http_request


class ContentNegociationHandler(urllib2.BaseHandler):  # FIXME
    def __init__(self, accept=None, strict=False):
        self.accept = accept
        self.strict = strict

    def http_request(self, req):
        if self.accept is not None:
            if isinstance(self.accept, basestring):
                self.accept = (self.accept,)

            out = {}
            rank = 1.1
            for group in self.accept:
                rank -= 0.1

                if isinstance(group, basestring):
                    if group in MIMETYPE:
                        group = MIMETYPE[group]
                    else:
                        out[group] = rank
                        continue

                for mime in group:
                    if mime not in out:
                        out[mime] = rank

            if not self.strict:
                out['*/*'] = rank - 0.1

            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
            req.add_unredirected_header('Accept', string)

        return req

    https_request = http_request


class MetaRedirectHandler(urllib2.BaseHandler):
    def http_response(self, req, resp):
        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            if resp.info().type in MIMETYPE['html']:
                data = resp.read()
                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                if match:
                    new_url = match.groups()[0]
                    new_headers = dict((k, v) for k, v in req.headers.items()
                                       if k.lower() not in ('content-length', 'content-type'))
                    new = urllib2.Request(new_url,
                                          headers=new_headers,
                                          origin_req_host=req.get_origin_req_host(),
                                          unverifiable=True)

                    return self.parent.open(new, timeout=req.timeout)
                else:
                    fp = StringIO(data)
                    old_resp = resp
                    resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
                    resp.msg = old_resp.msg

        return resp

    https_response = http_response


class EtagHandler(urllib2.BaseHandler):
    def __init__(self, cache="", etag=None, lastmodified=None):
        self.cache = cache
        self.etag = etag
        self.lastmodified = lastmodified

    def http_request(self, req):
        if self.cache:
            if self.etag:
                req.add_unredirected_header('If-None-Match', self.etag)
            if self.lastmodified:
                req.add_unredirected_header('If-Modified-Since', self.lastmodified)

        return req

    def http_error_304(self, req, fp, code, msg, headers):
        if self.etag:
            headers.addheader('etag', self.etag)
        if self.lastmodified:
            headers.addheader('last-modified', self.lastmodified)
        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
        return resp

    https_request = http_request
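For context, and not part of the commit: a minimal sketch of how these single-purpose handlers can be chained into one urllib2 opener (Python 2, matching the code above). The feed URL, User-Agent string and timeout are placeholders.

import urllib2

import crawler  # the new module above, imported the same way morss.py imports it

# build_opener() chains these with urllib2's default handlers; each one does a single job
handlers = [crawler.GZIPHandler(),                            # request gzip and transparently decompress
            crawler.UAHandler('Mozilla/5.0 (X11; Linux)'),    # placeholder User-Agent
            crawler.AutoRefererHandler(),                     # set Referer to the target host
            crawler.MetaRedirectHandler(),                    # follow <meta http-equiv="refresh"> pages
            crawler.EncodingFixHandler(),                     # decode/re-encode text bodies to scrub bad bytes
            crawler.ContentNegociationHandler(('xml', 'html'))]  # build the Accept header from MIMETYPE groups

con = urllib2.build_opener(*handlers).open('http://example.com/feed', timeout=10)
data = con.read()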
154 morss/morss.py
@@ -16,6 +16,7 @@ import lxml.html
 
 import feeds
 import feedify
+import crawler
 
 import httplib
 import urllib
@@ -25,9 +26,6 @@ import urlparse
 import wsgiref.simple_server
 import wsgiref.handlers
 
-from gzip import GzipFile
-from StringIO import StringIO
-
 from readability import readability
 from html2text import HTML2Text
 
@@ -41,8 +39,7 @@ THREADS = 10 # number of threads (1 for single-threaded)
 
 DEBUG = False
 
-UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
-UA_HTML = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
+DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 
 MIMETYPE = {
     'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
@@ -214,136 +211,20 @@ class Cache:
         return self
 
 
-class SimpleDownload(urllib2.HTTPCookieProcessor):
-    """
-    Custom urllib2 handler to download a page, using etag/last-modified headers,
-    to save bandwidth. The given headers are added back into the header on error
-    304 for easier use.
-    """
+default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
+    crawler.AutoRefererHandler(), crawler.MetaRedirectHandler(),
+    crawler.EncodingFixHandler()]
 
-    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=True, cookiejar=None,
-                 accept=None, strict=False):
-        urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
-        self.cache = cache
-        self.etag = etag
-        self.lastmodified = lastmodified
-        self.useragent = useragent
-        self.decode = decode
-        self.accept = accept
-        self.strict = strict
+def accept_handler(*kargs):
+    handlers = default_handlers[:]
+    handlers.append(crawler.ContentNegociationHandler(*kargs))
+    return handlers
 
-    def http_request(self, req):
-        urllib2.HTTPCookieProcessor.http_request(self, req)
-        req.add_unredirected_header('Accept-Encoding', 'gzip')
-        req.add_unredirected_header('User-Agent', self.useragent)
-        if req.get_host() != 'feeds.feedburner.com':
-            req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
-
-        if self.cache:
-            if self.etag:
-                req.add_unredirected_header('If-None-Match', self.etag)
-            if self.lastmodified:
-                req.add_unredirected_header('If-Modified-Since', self.lastmodified)
-
-        if self.accept is not None:
-            if isinstance(self.accept, basestring):
-                self.accept = (self.accept,)
-
-            out = {}
-            rank = 1.1
-            for group in self.accept:
-                rank -= 0.1
-
-                if isinstance(group, basestring):
-                    if group in MIMETYPE:
-                        group = MIMETYPE[group]
-                    else:
-                        out[group] = rank
-                        continue
-
-                for mime in group:
-                    if mime not in out:
-                        out[mime] = rank
-
-            if not self.strict:
-                out['*/*'] = rank - 0.1
-
-            string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
-            req.add_unredirected_header('Accept', string)
-
-        return req
-
-    def http_error_304(self, req, fp, code, msg, headers):
-        log('http cached')
-        if self.etag:
-            headers.addheader('etag', self.etag)
-        if self.lastmodified:
-            headers.addheader('last-modified', self.lastmodified)
-        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
-        return resp
-
-    def http_response(self, req, resp):
-        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
-        data = resp.read()
-
-        if 200 <= resp.code < 300:
-            # gzip
-            if resp.headers.get('Content-Encoding') == 'gzip':
-                log('un-gzip')
-                data = GzipFile(fileobj=StringIO(data), mode='r').read()
-
-        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
-            # <meta> redirect
-            if resp.info().type in MIMETYPE['html']:
-                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
-                if match:
-                    new_url = match.groups()[0]
-                    log('redirect: %s' % new_url)
-
-                    new_headers = dict((k, v) for k, v in req.headers.items()
-                                       if k.lower() not in ('content-length', 'content-type'))
-                    new = urllib2.Request(new_url,
-                                          headers=new_headers,
-                                          origin_req_host=req.get_origin_req_host(),
-                                          unverifiable=True)
-
-                    return self.parent.open(new, timeout=req.timeout)
-
-            # encoding
-            enc = detect_encoding(data, resp)
-
-            if enc:
-                data = data.decode(enc, 'replace')
-
-                if not self.decode:
-                    data = data.encode(enc)
-
-        fp = StringIO(data)
-        old_resp = resp
-        resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-        resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
-    https_request = http_request
-
-
-def detect_encoding(data, con=None):
-    if con is not None and con.headers.getparam('charset'):
-        log('header')
-        return con.headers.getparam('charset')
-
-    match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
-    if match:
-        log('meta.re')
-        return match.groups()[0]
-
-    match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
-    if match:
-        return match.groups()[0].lower()
-
-    return None
+def etag_handler(accept, strict, cache, etag, lastmodified):
+    handlers = default_handlers[:]
+    handlers.append(crawler.ContentNegociationHandler(accept, strict))
+    handlers.append(crawler.EtagHandler(cache, etag, lastmodified))
+    return handlers
 
 
 def Fix(item, feedurl='/'):
@@ -485,7 +366,7 @@ def Fill(item, cache, options, feedurl='/', fast=False):
     # download
     try:
         url = link.encode('utf-8')
-        con = urllib2.build_opener(SimpleDownload(accept=('html', 'text/*'), strict=True)).open(url, timeout=TIMEOUT)
+        con = urllib2.build_opener(*accept_handler(('html', 'text/*'), True)).open(url, timeout=TIMEOUT)
         data = con.read()
     except (IOError, httplib.HTTPException) as e:
         log('http error: %s' % e.message)
@@ -546,9 +427,8 @@ def Fetch(url, cache, options):
         style = cache.get('style')
     else:
         try:
-            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'),
-                                    accept=('xml', 'html'))
-            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT * 2)
+            opener = etag_handler(('xml', 'html'), False, cache.get(url), cache.get('etag'), cache.get('lastmodified'))
+            con = urllib2.build_opener(*opener).open(url, timeout=TIMEOUT * 2)
             xml = con.read()
         except (urllib2.HTTPError) as e:
             raise MorssException('Error downloading feed (HTTP Error %s)' % e.code)
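Also not part of the commit: a sketch of the conditional fetch that etag_handler() sets up. EtagHandler sends If-None-Match / If-Modified-Since taken from a previous response; if the server replies 304 Not Modified, its http_error_304() hands back the cached body as a plain 200 response, so the caller reads it exactly like a fresh download. The cached values and URL are placeholders.

import urllib2

import crawler

cached_body = '<rss>...</rss>'                  # placeholder: body stored from an earlier fetch
cached_etag = '"abc123"'                        # placeholder: ETag header seen on that fetch
cached_date = 'Tue, 10 Dec 2013 00:00:00 GMT'   # placeholder: Last-Modified header seen then

handlers = [crawler.GZIPHandler(),
            crawler.ContentNegociationHandler(('xml', 'html')),
            crawler.EtagHandler(cached_body, cached_etag, cached_date)]

con = urllib2.build_opener(*handlers).open('http://example.com/feed', timeout=20)
data = con.read()  # fresh body on 200, the cached body if the server answered 304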