2015-04-06 15:26:12 +00:00
import sys
2017-11-25 18:57:41 +00:00
import zlib
2015-04-06 15:26:12 +00:00
from io import BytesIO , StringIO
import re
2017-03-08 21:37:12 +00:00
import chardet
2017-10-27 21:14:08 +00:00
from cgi import parse_header
2017-03-09 03:50:57 +00:00
import lxml . html
2015-04-06 15:26:12 +00:00
import time
2020-04-24 09:28:39 +00:00
import random
2015-02-25 09:53:36 +00:00
try :
2018-10-24 23:14:46 +00:00
# python 2
2017-03-26 05:51:42 +00:00
from urllib2 import BaseHandler , HTTPCookieProcessor , Request , addinfourl , parse_keqv_list , parse_http_list , build_opener
2020-04-28 12:47:23 +00:00
from urllib import quote
from urlparse import urlparse , urlunparse
2015-04-06 15:26:12 +00:00
import mimetools
2015-02-25 10:07:09 +00:00
except ImportError :
2018-10-24 23:14:46 +00:00
# python 3
2017-03-26 05:51:42 +00:00
from urllib . request import BaseHandler , HTTPCookieProcessor , Request , addinfourl , parse_keqv_list , parse_http_list , build_opener
2020-04-28 12:47:23 +00:00
from urllib . parse import quote
from urllib . parse import urlparse , urlunparse
2015-04-06 15:26:12 +00:00
import email
2014-11-19 10:57:40 +00:00
2015-02-25 16:50:23 +00:00
try :
2018-10-24 23:14:46 +00:00
# python 2
2015-02-25 16:50:23 +00:00
basestring
except NameError :
2018-10-24 23:14:46 +00:00
# python 3
2015-04-06 15:26:12 +00:00
basestring = unicode = str
2015-02-25 16:50:23 +00:00
2014-11-19 10:57:40 +00:00
# content-types accepted for each family of documents
MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
    'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
    }
2017-03-19 08:51:27 +00:00
2020-04-24 09:28:39 +00:00
# pool of realistic browser user-agents, one is picked at random per opener
# https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
DEFAULT_UAS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    ]
2017-03-19 08:51:27 +00:00
2020-04-28 20:03:49 +00:00
PROTOCOL = [ ' http ' , ' https ' ]
2020-04-07 08:30:17 +00:00
def get(*args, **kwargs):
    "Convenience wrapper around adv_get: fetch a url and return only the body bytes"
    return adv_get(*args, **kwargs)['data']
2020-04-07 08:30:17 +00:00
def adv_get(url, timeout=None, *args, **kwargs):
    "Fetch `url` and return a dict with body bytes, final url, response object, content-type and encoding"
    url = sanitize_url(url)

    opener = custom_handler(*args, **kwargs)

    if timeout is None:
        con = opener.open(url)
    else:
        con = opener.open(url, timeout=timeout)

    data = con.read()

    # strip any ';charset=...' suffix from the content-type
    contenttype = con.info().get('Content-Type', '').split(';')[0]
    encoding = detect_encoding(data, con)

    return {
        'data': data,
        'url': con.geturl(),
        'con': con,
        'contenttype': contenttype,
        'encoding': encoding,
        }
2020-04-07 08:30:17 +00:00
2020-04-05 14:05:59 +00:00
def custom_handler(follow=None, delay=None, encoding=None):
    "Build an urllib opener with all the custom handlers of this module"

    # as per urllib2 source code, these Handelers are added first
    # *unless* one of the custom handlers inherits from one of them
    #
    # [ProxyHandler, UnknownHandler, HTTPHandler,
    # HTTPDefaultErrorHandler, HTTPRedirectHandler,
    # FTPHandler, FileHandler, HTTPErrorProcessor]
    # & HTTPSHandler

    handlers = [
        #DebugHandler(),
        SizeLimitHandler(500 * 1024), # 500KiB
        HTTPCookieProcessor(),
        GZIPHandler(),
        HTTPEquivHandler(),
        HTTPRefreshHandler(),
        UAHandler(random.choice(DEFAULT_UAS)),
        BrowserlyHeaderHandler(),
        EncodingFixHandler(encoding),
        ]

    if follow:
        # redirect html pages towards their <link rel="alternate"> of the wanted type
        handlers.append(AlternateHandler(MIMETYPE[follow]))

    handlers.append(CacheHandler(force_min=delay))

    return build_opener(*handlers)
2020-04-28 12:47:23 +00:00
def is_ascii(string):
    "True when `string` contains only ascii characters"
    # there's a native function in py3, but home-made fix for backward compatibility
    try:
        string.encode('ascii')

    except UnicodeError:
        return False

    return True
2020-04-28 20:03:49 +00:00
def sanitize_url(url):
    "Clean a user-supplied url into something urllib can actually open"

    # make sure the url is unicode, i.e. not bytes
    if isinstance(url, bytes):
        url = url.decode()

    # make sure there's a protocol (http://)
    if url.split(':', 1)[0] not in PROTOCOL:
        url = 'http://' + url

    # turns out some websites have really badly fomatted urls (fix http:/badurl)
    url = re.sub('^(https?):/([^/])', r'\1://\2', url)

    # escape spaces
    url = url.replace(' ', '%20')

    # escape non-ascii unicode characters
    # https://stackoverflow.com/a/4391299
    parts = list(urlparse(url))

    for index, part in enumerate(parts):
        if is_ascii(part):
            continue

        if index == 1:
            # netloc goes through idna, the rest is percent-encoded
            parts[index] = part.encode('idna').decode('ascii')

        else:
            parts[index] = quote(part.encode('utf-8'))

    return urlunparse(parts)
2017-10-27 21:10:03 +00:00
class DebugHandler(BaseHandler):
    "Print outgoing request headers and incoming response header internals"

    handler_order = 2000

    def http_request(self, req):
        print(repr(req.header_items()))
        return req

    def http_response(self, req, resp):
        print(resp.headers.__dict__)
        return resp

    https_request = http_request
    https_response = http_response
2017-10-27 21:12:40 +00:00
class SizeLimitHandler(BaseHandler):
    """Limit file size, defaults to 5MiB"""

    handler_order = 450

    def __init__(self, limit=5*1024*1024):
        # BUGFIX: was `5*1024^2`, but `^` is xor in python (== 5122 bytes);
        # the intended default, per the docstring, is 5MiB = 5*1024*1024
        self.limit = limit

    def http_response(self, req, resp):
        # read at most `limit` bytes, then rewrap the truncated body in a
        # fresh response object so downstream handlers can read it again
        data = resp.read(self.limit)

        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response
2017-11-25 18:57:41 +00:00
def UnGzip(data):
    "Supports truncated files"
    # MAX_WBITS | 32 lets zlib auto-detect gzip/zlib headers; a decompressobj
    # (unlike zlib.decompress) returns what it managed to inflate so far
    decompressor = zlib.decompressobj(zlib.MAX_WBITS | 32)
    return decompressor.decompress(data)
2017-10-27 21:12:40 +00:00
2015-02-25 09:53:36 +00:00
class GZIPHandler(BaseHandler):
    "Ask for gzip-compressed responses and transparently inflate them"

    def http_request(self, req):
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

    def http_response(self, req, resp):
        if 200 <= resp.code < 300 and resp.headers.get('Content-Encoding') == 'gzip':
            data = UnGzip(resp.read())

            # body is now plain bytes, reflect that in the headers
            resp.headers['Content-Encoding'] = 'identity'

            fp = BytesIO(data)
            original = resp
            resp = addinfourl(fp, original.headers, original.url, original.code)
            resp.msg = original.msg

        return resp

    https_response = http_response
    https_request = http_request
2017-10-27 21:14:08 +00:00
def detect_encoding(data, resp=None):
    "Detect the document encoding, widening gb2312 to its superset gbk"
    enc = detect_raw_encoding(data, resp)
    return 'gbk' if enc == 'gb2312' else enc
def detect_raw_encoding(data, resp=None):
    "Guess the encoding of `data`: http headers, then in-document declarations, then chardet, then utf-8"

    # 1. explicit charset in the response headers
    if resp is not None:
        enc = resp.headers.get('charset')
        if enc is not None:
            return enc

        enc = parse_header(resp.headers.get('content-type', ''))[1].get('charset')
        if enc is not None:
            return enc

    # 2. charset=/encoding= declared near the top of the document
    for pattern in (b'charset=["\']?([0-9a-zA-Z-]+)', b'encoding=["\']?([0-9a-zA-Z-]+)'):
        match = re.search(pattern, data[:1000])
        if match:
            return match.groups()[0].lower().decode()

    # 3. statistical guess on the tail of the document
    enc = chardet.detect(data[-2000:])['encoding']
    if enc and enc != 'ascii':
        return enc

    # 4. last resort
    return 'utf-8'
2014-11-19 10:57:40 +00:00
2015-02-25 09:53:36 +00:00
class EncodingFixHandler(BaseHandler):
    "Normalize text responses by decoding (replacing bad bytes) and re-encoding"

    def __init__(self, encoding=None):
        self.encoding = encoding # forced encoding, or None to auto-detect

    def http_response(self, req, resp):
        maintype = resp.info().get('Content-Type', '').split('/')[0]

        if 200 <= resp.code < 300 and maintype == 'text':
            data = resp.read()

            enc = self.encoding or detect_encoding(data, resp)

            if enc:
                # 'replace' drops undecodable byte sequences before re-encoding
                data = data.decode(enc, 'replace')
                data = data.encode(enc)

            fp = BytesIO(data)
            original = resp
            resp = addinfourl(fp, original.headers, original.url, original.code)
            resp.msg = original.msg

        return resp

    https_response = http_response
2015-02-25 09:53:36 +00:00
class UAHandler(BaseHandler):
    "Set the User-Agent header on outgoing requests"

    def __init__(self, useragent=None):
        self.useragent = useragent

    def http_request(self, req):
        if not self.useragent:
            return req

        req.add_unredirected_header('User-Agent', self.useragent)
        return req

    https_request = http_request
2020-04-05 19:11:57 +00:00
class BrowserlyHeaderHandler(BaseHandler):
    """Add more headers to look less suspicious"""

    def http_request(self, req):
        for name, value in (
                ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                ('Accept-Language', 'en-US,en;q=0.5')):
            req.add_unredirected_header(name, value)

        return req

    https_request = http_request
2020-04-05 14:05:59 +00:00
class AlternateHandler(BaseHandler):
    "Follow <link rel='alternate' type='application/rss+xml' href='...'/>"

    def __init__(self, follow=None):
        self.follow = follow or [] # content-types we actually want

    def http_response(self, req, resp):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]

        if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
            # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
            data = resp.read()
            doc = lxml.html.fromstring(data[:10000])

            for link in doc.findall('.//link[@rel="alternate"]'):
                if link.get('type', '') in self.follow:
                    # rewrite this response into a redirect towards the alternate
                    resp.code = 302
                    resp.msg = 'Moved Temporarily'
                    resp.headers['location'] = link.get('href')
                    break

            fp = BytesIO(data)
            original = resp
            resp = addinfourl(fp, original.headers, original.url, original.code)
            resp.msg = original.msg

        return resp

    https_response = http_response
2014-11-19 10:57:40 +00:00
2015-04-06 15:03:17 +00:00
class HTTPEquivHandler(BaseHandler):
    "Handler to support <meta http-equiv='...' content='...'/>, since it defines HTTP headers"

    handler_order = 600

    def http_response(self, req, resp):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]

        if not (200 <= resp.code < 300 and contenttype in MIMETYPE['html']):
            return resp

        data = resp.read()

        # copy each <meta http-equiv> pair into the real response headers
        for meta in lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]'):
            resp.headers[meta.get('http-equiv').lower()] = meta.get('content')

        fp = BytesIO(data)
        original = resp
        resp = addinfourl(fp, original.headers, original.url, original.code)
        resp.msg = original.msg

        return resp

    https_response = http_response
class HTTPRefreshHandler(BaseHandler):
    "Turn 'Refresh: N; url=...' headers into proper 302 redirects"

    handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000

    def http_response(self, req, resp):
        refresh = resp.headers.get('refresh')

        if 200 <= resp.code < 300 and refresh:
            regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
            match = re.search(regex, refresh)

            if match:
                url = match.groupdict()['url']

                if url:
                    resp.code = 302
                    resp.msg = 'Moved Temporarily'
                    resp.headers['location'] = url

        return resp

    https_response = http_response
2017-11-04 11:41:56 +00:00
# module-level fallback cache, shared by every CacheHandler not given its own
default_cache = {}


class CacheHandler(BaseHandler):
    "Cache based on etags/last-modified"

    private_cache = False # False to behave like a CDN (or if you just don't care), True like a PC
    handler_order = 499

    def __init__(self, cache=None, force_min=None):
        # cache: dict-like mapping url -> (code, msg, headers-as-string, data, timestamp)
        self.cache = cache or default_cache
        self.force_min = force_min # force_min (seconds) to bypass http headers, -1 forever, 0 never, -2 do nothing if not in cache

    def load(self, url):
        # Return [code, msg, headers, data, timestamp] for `url`; when the url
        # is not cached, a blank entry with code=None is returned instead.
        try:
            out = list(self.cache[url])

        except KeyError:
            out = [None, None, unicode(), bytes(), 0]

        # headers are stored serialized; parse them back into a message object
        if sys.version_info[0] >= 3:
            out[2] = email.message_from_string(out[2] or unicode()) # headers
        else:
            out[2] = mimetools.Message(StringIO(out[2] or unicode()))

        return out

    def save(self, url, code, msg, headers, data, timestamp):
        # headers serialized to a string so any dict-like backend can store them
        self.cache[url] = (code, msg, unicode(headers), data, timestamp)

    def http_request(self, req):
        # add conditional-request headers (etag / last-modified) when we have
        # a cached copy, so the server can answer 304 Not Modified
        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())

        if 'etag' in headers:
            req.add_unredirected_header('If-None-Match', headers['etag'])

        if 'last-modified' in headers:
            req.add_unredirected_header('If-Modified-Since', headers.get('last-modified'))

        return req

    def http_open(self, req):
        # decide whether to answer from cache (return a response object) or to
        # let urllib hit the network (return None)
        (code, msg, headers, data, timestamp) = self.load(req.get_full_url())

        # some info needed to process everything
        cache_control = parse_http_list(headers.get('cache-control', ()))
        cache_control += parse_http_list(headers.get('pragma', ()))

        cc_list = [x for x in cache_control if '=' not in x]
        cc_values = parse_keqv_list([x for x in cache_control if '=' in x])

        cache_age = time.time() - timestamp

        # list in a simple way what to do when
        if req.get_header('Morss') == 'from_304': # for whatever reason, we need an uppercase
            # we're just in the middle of a dirty trick, use cache
            pass

        elif self.force_min == -2:
            if code is not None:
                # already in cache, perfect, use cache
                pass

            else:
                # not cached: answer 409 instead of touching the network
                headers['Morss'] = 'from_cache'
                resp = addinfourl(BytesIO(), headers, req.get_full_url(), 409)
                resp.msg = 'Conflict'
                return resp

        elif code is None:
            # cache empty, refresh
            return None

        elif self.force_min == -1:
            # force use cache
            pass

        elif self.force_min == 0:
            # force refresh
            return None

        elif code == 301 and cache_age < 7 * 24 * 3600:
            # "301 Moved Permanently" has to be cached...as long as we want (awesome HTTP specs), let's say a week (why not?)
            # use force_min=0 if you want to bypass this (needed for a proper refresh)
            pass

        elif self.force_min is None and ('no-cache' in cc_list
                                         or 'no-store' in cc_list
                                         or ('private' in cc_list and not self.private_cache)):
            # kindly follow web servers indications, refresh
            return None

        elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
            # server says it's still fine (and we trust him, if not, use force_min=0), use cache
            pass

        elif self.force_min is not None and self.force_min > cache_age:
            # still recent enough for us, use cache
            pass

        else:
            # according to the www, we have to refresh when nothing is said
            return None

        # return the cache as a response
        headers['morss'] = 'from_cache' # TODO delete the morss header from incoming pages, to avoid websites messing up with us
        resp = addinfourl(BytesIO(data), headers, req.get_full_url(), code)
        resp.msg = msg

        return resp

    def http_response(self, req, resp):
        # code for after-fetch, to know whether to save to hard-drive (if stiking to http headers' will)

        if resp.code == 304:
            # not modified: nothing new to store
            return resp

        if ('cache-control' in resp.headers or 'pragma' in resp.headers) and self.force_min is None:
            cache_control = parse_http_list(resp.headers.get('cache-control', ()))
            cache_control += parse_http_list(resp.headers.get('pragma', ()))

            cc_list = [x for x in cache_control if '=' not in x]

            if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and not self.private_cache):
                # kindly follow web servers indications
                return resp

        if resp.headers.get('Morss') == 'from_cache':
            # it comes from cache, so no need to save it again
            return resp

        # save to disk
        data = resp.read()
        self.save(req.get_full_url(), resp.code, resp.msg, resp.headers, data, time.time())

        # rewrap the consumed body so downstream readers still get the data
        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    def http_error_304(self, req, fp, code, msg, headers):
        # server says "not modified": bump the cache timestamp, then replay
        # the request flagged 'from_304' so http_open serves the cached copy
        cache = list(self.load(req.get_full_url()))

        if cache[0]:
            cache[-1] = time.time()
            self.save(req.get_full_url(), *cache)

            new = Request(req.get_full_url(),
                          headers=req.headers,
                          unverifiable=True)

            new.add_unredirected_header('Morss', 'from_304')

            return self.parent.open(new, timeout=req.timeout)

        return None

    https_request = http_request
    https_open = http_open
    https_response = http_response
2017-11-04 13:48:00 +00:00
class BaseCache:
    """Subclasses must behave like a dict"""

    def __contains__(self, url):
        # membership via lookup: subclasses only need __getitem__/KeyError
        try:
            self[url]

        except KeyError:
            return False

        return True
2017-11-04 11:41:56 +00:00
import sqlite3


class SQLiteCache(BaseCache):
    "Cache backend storing entries in an sqlite database (in-memory by default)"

    def __init__(self, filename=':memory:'):
        self.con = sqlite3.connect(filename, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)

        with self.con:
            self.con.execute('CREATE TABLE IF NOT EXISTS data (url UNICODE PRIMARY KEY, code INT, msg UNICODE, headers UNICODE, data BLOB, timestamp INT)')
            self.con.execute('pragma journal_mode=WAL')

    def __del__(self):
        self.con.close()

    def __getitem__(self, url):
        row = self.con.execute('SELECT * FROM data WHERE url=?', (url,)).fetchone()

        if not row:
            raise KeyError

        # drop the url column, keep (code, msg, headers, data, timestamp)
        return row[1:]

    def __setitem__(self, url, value): # value = (code, msg, headers, data, timestamp)
        code, msg, headers, data, timestamp = value
        record = (url, code, msg, headers, sqlite3.Binary(data), timestamp)

        with self.con:
            # upsert: insert, or overwrite every column on url collision
            self.con.execute('INSERT INTO data VALUES (?,?,?,?,?,?) ON CONFLICT(url) DO UPDATE SET code=?, msg=?, headers=?, data=?, timestamp=?', record + record[1:])
2017-10-27 23:28:47 +00:00
2015-04-06 15:26:12 +00:00
2017-11-04 13:51:41 +00:00
import pymysql.cursors


class MySQLCacheHandler(BaseCache):
    "Cache backend storing entries in a MySQL table (one connection per operation)"

    def __init__(self, user, password, database, host='localhost'):
        self.user = user
        self.password = password
        self.database = database
        self.host = host

        with self.cursor() as cursor:
            cursor.execute('CREATE TABLE IF NOT EXISTS data (url VARCHAR(255) NOT NULL PRIMARY KEY, code INT, msg TEXT, headers TEXT, data BLOB, timestamp INT)')

    def cursor(self):
        # fresh autocommit connection each time; avoids stale-connection issues
        return pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', autocommit=True).cursor()

    def __getitem__(self, url):
        cursor = self.cursor()
        cursor.execute('SELECT * FROM data WHERE url=%s', (url,))
        row = cursor.fetchone()

        if not row:
            raise KeyError

        return row[1:]

    def __setitem__(self, url, value): # (code, msg, headers, data, timestamp)
        with self.cursor() as cursor:
            cursor.execute('INSERT INTO data VALUES (%s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE code=%s, msg=%s, headers=%s, data=%s, timestamp=%s',
                    (url,) + value + value)
2020-04-27 15:19:31 +00:00
if __name__ == '__main__':
    # command-line smoke test: fetch a url (default https://morss.it) and dump the body
    req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')

    if not sys.flags.interactive:
        print(req['data'].decode(req['encoding']))