2015-04-06 15:26:12 +00:00
import sys
2014-11-19 10:57:40 +00:00
import ssl
import socket
from gzip import GzipFile
2015-04-06 15:26:12 +00:00
from io import BytesIO , StringIO
import re
import sqlite3
import time
2015-02-25 09:53:36 +00:00
try :
2015-04-06 14:54:59 +00:00
from urllib2 import BaseHandler , addinfourl , parse_keqv_list , parse_http_list
2015-04-06 15:26:12 +00:00
import mimetools
2015-02-25 10:07:09 +00:00
except ImportError :
2015-04-06 14:54:59 +00:00
from urllib . request import BaseHandler , addinfourl , parse_keqv_list , parse_http_list
2015-04-06 15:26:12 +00:00
import email
2014-11-19 10:57:40 +00:00
2015-02-25 16:50:23 +00:00
# Python 2 ships `basestring`, `unicode` and `buffer` as builtins. When the
# probe below fails, the names are recreated as aliases so the rest of the
# module can keep using them.
try:
    basestring
except NameError:
    basestring = str
    unicode = str
    buffer = memoryview
2015-02-25 16:50:23 +00:00
2014-11-19 10:57:40 +00:00
# Short names for families of mime types, each listed in order of preference.
MIMETYPE = {
    'xml': [
        'text/xml',
        'application/xml',
        'application/rss+xml',
        'application/rdf+xml',
        'application/atom+xml',
    ],
    'html': [
        'text/html',
        'application/xhtml+xml',
        'application/xml',
    ],
}
2015-02-25 09:53:36 +00:00
class GZIPHandler(BaseHandler):
    "Ask servers for gzip-compressed bodies and inflate the ones that come back compressed"

    def http_request(self, req):
        "Advertise gzip support on the outgoing request and hand it back"
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

    def http_response(self, req, resp):
        "Return `resp`, or an inflated copy of it when it is a gzip-encoded 2xx reply"
        is_success = 200 <= resp.code < 300
        if not is_success or resp.headers.get('Content-Encoding') != 'gzip':
            return resp

        compressed = resp.read()
        inflated = GzipFile(fileobj=BytesIO(compressed), mode='r').read()

        # the body stream has been consumed, so wrap the inflated bytes in a
        # fresh response carrying the same headers, url, code and msg
        unzipped = addinfourl(BytesIO(inflated), resp.headers, resp.url, resp.code)
        unzipped.msg = resp.msg
        return unzipped

    https_response = http_response
    https_request = http_request
def detect_encoding(data, con=None):
    """Guess the character encoding of a fetched document.

    `data` is the raw body (bytes); `con` is an optional response object
    exposing `info()`. The first source that yields a value wins:

    1. a `charset` header on `con.info()`, returned as-is;
    2. the `charset=` parameter of the `Content-Type` header of `con`;
    3. a `charset=...` declaration in the first 1000 bytes of `data`;
    4. an `encoding=...` declaration in the first 100 bytes of `data`;
    5. the fallback 'utf-8'.

    Values found by steps 2-4 are returned lower-cased, as text.
    """
    if con is not None:
        info = con.info()

        declared = info.get('charset')
        if declared:
            return declared

        # HTTP announces the charset as a parameter of Content-Type
        # ("text/html; charset=..."), not as a header of its own
        match = re.search(r'(?i)charset=["\']?([0-9a-zA-Z_-]+)', info.get('Content-Type') or '')
        if match:
            return match.group(1).lower()

    # "_" is part of the accepted characters so that names such as Shift_JIS
    # are captured whole
    match = re.search(b'charset=["\']?([0-9a-zA-Z_-]+)', data[:1000])
    if match:
        return match.group(1).lower().decode()

    match = re.search(b'encoding=["\']?([0-9a-zA-Z_-]+)', data[:100])
    if match:
        return match.group(1).lower().decode()

    return 'utf-8'
2014-11-19 10:57:40 +00:00
2015-02-25 09:53:36 +00:00
class EncodingFixHandler(BaseHandler):
    "Rewrite the body of textual 2xx responses so that it is valid in its detected encoding"

    def http_response(self, req, resp):
        """Return `resp`, or a copy whose body went through a decode/encode round trip.

        Only 2xx responses whose Content-Type main type is `text` are
        touched; everything else is returned unchanged and unread.
        """
        maintype = resp.info().get('Content-Type', '').split('/')[0]
        if not (200 <= resp.code < 300 and maintype == 'text'):
            return resp

        data = resp.read()

        enc = detect_encoding(data, resp)
        if enc:
            try:
                # Undecodable bytes become U+FFFD on the way in. Encoding with
                # 'replace' as well keeps codecs that cannot represent U+FFFD
                # from raising on the way out.
                data = data.decode(enc, 'replace').encode(enc, 'replace')
            except LookupError:
                # the document names a codec Python does not know: keep the
                # raw bytes rather than failing the whole fetch
                pass

        # the body stream has been consumed, so rebuild a response around the bytes
        fixed = addinfourl(BytesIO(data), resp.headers, resp.url, resp.code)
        fixed.msg = resp.msg
        return fixed

    https_response = http_response
2015-02-25 09:53:36 +00:00
class UAHandler(BaseHandler):
    "Stamp outgoing requests with a fixed User-Agent, when one is configured"

    def __init__(self, useragent=None):
        # a falsy value (None, '') leaves requests untouched
        self.useragent = useragent

    def http_request(self, req):
        "Add the configured User-Agent header, if any, and return the request"
        agent = self.useragent
        if not agent:
            return req

        req.add_unredirected_header('User-Agent', agent)
        return req

    https_request = http_request
2015-02-25 09:53:36 +00:00
class AutoRefererHandler(BaseHandler):
    "Send the root of the requested host as Referer, except for feeds.feedburner.com"

    def http_request(self, req):
        "Add a Referer header built from the request's host and return the request"
        host = req.host
        if host == 'feeds.feedburner.com':
            return req

        req.add_unredirected_header('Referer', 'http://%s' % host)
        return req

    https_request = http_request
2015-02-25 09:53:36 +00:00
class ContentNegociationHandler(BaseHandler):
    """Build an `Accept` header from a list of preferred content types.

    `accept` is a single entry or a sequence of entries, most preferred
    first. An entry is either a string (a key of MIMETYPE, expanded to its
    mime types, or else a literal mime type) or an iterable of mime types.
    The first entry gets no q-value (i.e. 1.0) and each following entry is
    worth 0.1 less. Unless `strict` is set, `*/*` is appended one step below
    the last entry.
    """

    def __init__(self, accept=None, strict=False):
        self.accept = accept
        self.strict = strict

    def http_request(self, req):
        "Add the computed Accept header (nothing when `accept` is None) and return the request"
        if self.accept is None:
            return req

        # normalise locally instead of rewriting self.accept on every request
        groups = self.accept
        if isinstance(groups, basestring):
            groups = (groups,)

        # q-values are tracked as integer tenths: repeated float subtraction
        # drifts (0.7000000000000001, ...) and formats as "1e-16" or a negative
        # number once there are ten groups or more
        weights = {}
        level = 11
        for group in groups:
            level -= 1
            tenths = max(level, 1)  # floor at 0.1: q=0 would mean "not acceptable"

            if isinstance(group, basestring):
                if group not in MIMETYPE:
                    # literal mime type
                    weights[group] = tenths
                    continue
                group = MIMETYPE[group]

            for mime in group:
                # a mime type keeps the weight of the first group that listed it
                weights.setdefault(mime, tenths)

        if not self.strict:
            weights['*/*'] = max(level - 1, 1)

        parts = [mime if tenths >= 10 else '%s;q=0.%d' % (mime, tenths)
                 for mime, tenths in weights.items()]
        req.add_unredirected_header('Accept', ','.join(parts))
        return req

    https_request = http_request
2015-04-06 15:03:17 +00:00
class HTTPEquivHandler(BaseHandler):
    "Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers"

    handler_order = 600

    def http_response(self, req, resp):
        """Copy <meta http-equiv> declarations of an HTML page into the response headers.

        Only 2xx responses whose content type starts with `text/` and is
        listed in MIMETYPE['html'] are read; any other response is returned
        untouched.
        """
        contenttype = resp.info().get('Content-Type', '').split(';')[0]
        if not (200 <= resp.code < 300 and contenttype.startswith('text/')):
            return resp
        if contenttype not in MIMETYPE['html']:
            return resp

        body = resp.read()

        pattern = r'(?i)<meta\s+http-equiv=(["\'])(?P<key>[^"\']+)\1\s+content=(["\'])(?P<value>[^>]+)\3\s*/?>'
        # only the first 1000 bytes of the page are scanned for <meta> tags
        head = body[:1000].decode('utf-8', 'replace')
        for found in re.finditer(pattern, head):
            resp.headers[found.group('key').lower()] = found.group('value')

        # the body stream has been consumed, so rebuild a response around the bytes
        rebuilt = addinfourl(BytesIO(body), resp.headers, resp.url, resp.code)
        rebuilt.msg = resp.msg
        return rebuilt

    https_response = http_response
class HTTPRefreshHandler(BaseHandler):
    "Turn a 'refresh' header ('<delay>; url=<target>') on a 2xx response into a 302 redirect"

    handler_order = 700  # HTTPErrorProcessor has a handler_order of 1000

    def http_response(self, req, resp):
        "Rewrite code, msg and 'location' of `resp` in place when it carries a usable refresh header"
        if not 200 <= resp.code < 300:
            return resp

        refresh = resp.headers.get('refresh')
        if not refresh:
            return resp

        # the delay is parsed but ignored; the target may be wrapped in matching quotes
        pattern = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
        match = re.search(pattern, refresh)
        target = match.group('url') if match else None

        if target:
            resp.code = 302
            resp.msg = 'Moved Temporarily'
            resp.headers['location'] = target

        return resp

    https_response = http_response
2015-04-06 15:26:12 +00:00
class NotInCache(IOError):
    "Error signalling that a page had to be served from the cache but is not stored there"
class BaseCacheHandler(BaseHandler):
    """Cache based on etags/last-modified. Inherit from this to implement actual storage.

    Subclasses override `load` and `save`. This base class stores nothing:
    its `load` always reports an empty cache and its `save` discards its
    arguments.
    """

    private_cache = False  # False to behave like a CDN (or if you just don't care), True like a PC
    handler_order = 499

    def __init__(self, force_min=None):
        # force_min (seconds) to bypass http headers:
        #   None  follow the http headers
        #   > 0   also use the cache while the entry is younger than that many seconds
        #   0     never use the cache
        #   -1    always use the cache once the page is stored
        #   -2    do nothing if not in cache (a 409 response is returned instead)
        #   -3    like -2 but raises NotInCache
        self.force_min = force_min

    def _load(self, url):
        "Call `load` and normalise its result: parsed headers, bytes data, numeric timestamp"
        out = list(self.load(url))

        if sys.version_info[0] >= 3:
            out[2] = email.message_from_string(out[2] or u'')  # headers
        else:
            out[2] = mimetools.Message(StringIO(out[2] or u''))

        out[3] = out[3] or bytes()  # data
        out[4] = out[4] or 0  # timestamp

        return out

    def load(self, url):
        "Return the basic vars (code, msg, headers, data, timestamp)"
        return (None, None, None, None, None)

    def _save(self, url, code, msg, headers, data, timestamp):
        "Serialise the headers to text, then hand everything to `save`"
        headers = unicode(headers)
        self.save(url, code, msg, headers, data, timestamp)

    def save(self, url, code, msg, headers, data, timestamp):
        "Save values to disk"
        pass

    def _cache_directives(self, headers):
        "Split cache-control and pragma into (bare directives, {name: value} directives)"
        directives = parse_http_list(headers.get('cache-control', ()))
        directives += parse_http_list(headers.get('pragma', ()))

        flags = [x for x in directives if '=' not in x]
        values = parse_keqv_list([x for x in directives if '=' in x])
        return flags, values

    def _forbids_caching(self, cc_list):
        "Tell whether the bare directives `cc_list` rule out using or storing a copy"
        # 'private' only matters when acting as a shared cache. This reads
        # `private_cache`, the attribute the class actually defines.
        return ('no-cache' in cc_list
                or 'no-store' in cc_list
                or ('private' in cc_list and not self.private_cache))

    def http_request(self, req):
        "Turn the validators of the cached copy, if any, into conditional request headers"
        (code, msg, headers, data, timestamp) = self._load(req.get_full_url())

        if 'etag' in headers:
            req.add_unredirected_header('If-None-Match', headers['etag'])

        if 'last-modified' in headers:
            req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))

        return req

    def http_open(self, req):
        """Answer from the cache when allowed to, else return None to let the request go out.

        Raises NotInCache when force_min is -3 and the url is not cached.
        """
        (code, msg, headers, data, timestamp) = self._load(req.get_full_url())

        # some info needed to process everything
        cc_list, cc_values = self._cache_directives(headers)
        cache_age = time.time() - timestamp

        # list in a simple way what to do when
        if self.force_min in (-2, -3):
            if code is not None:
                # already in cache, perfect, use cache
                pass

            else:
                # ok then...
                if self.force_min == -2:
                    headers['morss'] = 'from_cache'
                    resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
                    resp.msg = 'Conflict'
                    return resp

                elif self.force_min == -3:
                    raise NotInCache()

        elif code is None:
            # cache empty, refresh
            return None

        elif self.force_min == -1:
            # force use cache
            pass

        elif self.force_min == 0:
            # force refresh
            return None

        elif self.force_min is None and self._forbids_caching(cc_list):
            # kindly follow web servers indications, refresh
            return None

        elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
            # server says it's still fine (and we trust him, if not, use force_min=0), use cache
            pass

        elif self.force_min is not None and self.force_min > cache_age:
            # still recent enough for us, use cache
            pass

        else:
            # according to the www, we have to refresh when nothing is said
            return None

        # return the cache as a response
        headers['morss'] = 'from_cache'  # TODO delete the morss header from incoming pages, to avoid websites messing up with us
        resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
        resp.msg = msg

        return resp

    def http_response(self, req, resp):
        "Store a freshly fetched response (unless told not to) and return a re-readable copy"
        # code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)

        if resp.code == 304:
            # nothing to store, left for http_error_304
            return resp

        if ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
            cc_list, _ = self._cache_directives(resp.headers)

            if self._forbids_caching(cc_list):
                # kindly follow web servers indications
                return resp

        if resp.headers.get('morss') == 'from_cache':
            # it comes from cache, so no need to save it again
            return resp

        # save to disk
        data = resp.read()
        self._save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())

        # the body stream has been consumed, so rebuild a response around the bytes
        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    def http_error_304(self, req, fp, code, msg, headers):
        "On 'Not Modified', answer with the stored copy of the page"
        (code, msg, headers, data, timestamp) = self._load(req.get_full_url())

        resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
        resp.msg = msg

        return resp

    https_request = http_request
    https_open = http_open
    https_response = http_response
# Database used when no filename is given. ':memory:' (with the trailing
# colon) is SQLite's name for an in-memory database; ':memory' is an ordinary
# file name and would create a file on disk.
sqlite_default = ':memory:'


class SQliteCacheHandler(BaseCacheHandler):
    "Cache handler keeping pages in a SQLite database (one row per url, in table 'data')"

    def __init__(self, force_min=-1, filename=None):
        """Open (and initialise, if needed) the cache database.

        `filename` is the database path; a falsy value selects `sqlite_default`.
        `force_min` is forwarded to BaseCacheHandler.
        """
        BaseCacheHandler.__init__(self, force_min)

        self.con = sqlite3.connect(filename or sqlite_default, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
        self.con.execute('create table if not exists data (url unicode PRIMARY KEY, code int, msg unicode, headers unicode, data bytes, timestamp int)')
        self.con.commit()

    def __del__(self):
        # `con` is missing when sqlite3.connect() raised in __init__
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def load(self, url):
        "Return the stored (code, msg, headers, data, timestamp) for `url`, or five Nones"
        row = self.con.execute('select * from data where url=?', (url,)).fetchone()

        if not row:
            return (None, None, None, None, None)

        # drop the leading url column
        return row[1:]

    def save(self, url, code, msg, headers, data, timestamp):
        "Insert the row for `url`, replacing any previous one, and commit"
        # sqlite3.Binary marks the bytes so that they are stored as a BLOB
        data = sqlite3.Binary(data)

        # url is the primary key, so 'insert or replace' overwrites an existing
        # row without needing a separate lookup first
        self.con.execute('insert or replace into data values (?,?,?,?,?,?)',
                         (url, code, msg, headers, data, timestamp))

        self.con.commit()