# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.

import os
import pickle
import random
import re
import sys
import time
import zlib
from cgi import parse_header
from collections import OrderedDict
from io import BytesIO, StringIO

import chardet

from .caching import default_cache

try:
    # python 2
    from urllib import quote

    from httplib import HTTPMessage
    from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
                         Request, addinfourl, build_opener, parse_http_list,
                         parse_keqv_list)
    from urlparse import urlparse, urlunparse

except ImportError:
    # python 3
    from email import message_from_string
    from http.client import HTTPMessage
    from urllib.parse import quote, urlparse, urlunparse
    from urllib.request import (BaseHandler, HTTPCookieProcessor,
                                HTTPRedirectHandler, Request, addinfourl,
                                build_opener, parse_http_list, parse_keqv_list)

try:
    # python 2
    basestring

except NameError:
    # python 3
    basestring = unicode = str


MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
    'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
    'json': ['application/json'],
    }


DEFAULT_UAS = [
    # https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
    ]


PROTOCOL = ['http', 'https']


def get(*args, **kwargs):
    return adv_get(*args, **kwargs)['data']


def adv_get(url, post=None, timeout=None, *args, **kwargs):
    url = sanitize_url(url)

    if post is not None:
        post = post.encode('utf-8')

    if timeout is None:
        con = custom_opener(*args, **kwargs).open(url, data=post)

    else:
        con = custom_opener(*args, **kwargs).open(url, data=post, timeout=timeout)

    data = con.read()

    contenttype = con.info().get('Content-Type', '').split(';')[0]
    encoding = detect_encoding(data, con)

    return {
        'data': data,
        'url': con.geturl(),
        'con': con,
        'contenttype': contenttype,
        'encoding': encoding
        }


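# Illustrative usage of adv_get() (comments only, not executed): it returns a
# dict with the raw bytes ('data'), the final URL after redirects ('url'), the
# connection object ('con'), the MIME type ('contenttype') and the detected
# 'encoding'.
#
#   page = adv_get('https://morss.it', timeout=4)
#   body = page['data'].decode(page['encoding'])
#   print(page['url'], page['contenttype'])

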
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
    # as per urllib2 source code, these handlers are added first
    # *unless* one of the custom handlers inherits from one of them
    #
    # [ProxyHandler, UnknownHandler, HTTPHandler,
    #  HTTPDefaultErrorHandler, HTTPRedirectHandler,
    #  FTPHandler, FileHandler, HTTPErrorProcessor]
    # & HTTPSHandler
    #
    # when processing a request:
    # (1) all the *_request methods are run
    # (2) the *_open methods are run until something is returned (other than None)
    # (3) all the *_response methods are run
    #
    # During (3), if an http error occurs (i.e. not a 2XX response code), the
    # http_error_* methods are run until something is returned (other than
    # None). If they all return nothing, a python error is raised

    handlers = [
        #DebugHandler(),
        SizeLimitHandler(500*1024), # 500KiB
        HTTPCookieProcessor(),
        GZIPHandler(),
        HTTPAllRedirectHandler(),
        HTTPEquivHandler(),
        HTTPRefreshHandler(),
        UAHandler(random.choice(DEFAULT_UAS)),
        BrowserlyHeaderHandler(),
        EncodingFixHandler(),
        ]

    if follow:
        handlers.append(AlternateHandler(MIMETYPE[follow]))

    handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max))

    return build_opener(*handlers)


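# Illustrative sketch (comments only, not executed): custom_opener() returns a
# plain urllib opener, so the handler chain above can be exercised directly.
# 'follow' must be a MIMETYPE key (e.g. 'rss'); 'policy' is handed over to
# CacheHandler below.
#
#   opener = custom_opener(follow='rss', policy='refresh')
#   con = opener.open('http://example.com/')
#   print(con.geturl(), con.code)

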
def is_ascii(string):
    # there's a native function in py3, but this home-made check keeps backward compatibility
    try:
        string.encode('ascii')

    except UnicodeError:
        return False

    else:
        return True


def sanitize_url(url):
    # make sure the url is unicode, i.e. not bytes
    if isinstance(url, bytes):
        url = url.decode()

    # make sure there's a protocol (http://)
    if url.split(':', 1)[0] not in PROTOCOL:
        url = 'http://' + url

    # turns out some websites have really badly formatted urls (fix http:/badurl)
    url = re.sub('^(https?):/([^/])', r'\1://\2', url)

    # escape spaces
    url = url.replace(' ', '%20')

    # escape non-ascii unicode characters
    # https://stackoverflow.com/a/4391299
    parts = list(urlparse(url))

    for i in range(len(parts)):
        if not is_ascii(parts[i]):
            if i == 1:
                parts[i] = parts[i].encode('idna').decode('ascii')

            else:
                parts[i] = quote(parts[i].encode('utf-8'))

    return urlunparse(parts)


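# A few illustrative inputs/outputs for sanitize_url() (comments only, not
# executed):
#
#   sanitize_url('morss.it')                  # -> 'http://morss.it'
#   sanitize_url('http:/morss.it/feed')       # -> 'http://morss.it/feed'
#   sanitize_url('https://example.com/a b')   # -> 'https://example.com/a%20b'

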
class RespDataHandler(BaseHandler):
    " Make it easier to use the response body "

    def data_response(self, req, resp, data):
        pass

    def http_response(self, req, resp):
        # read data
        data = resp.read()

        # process data and use returned content (if any)
        data = self.data_response(req, resp, data) or data

        # repackage the data into a fresh response object
        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response


class RespStrHandler(RespDataHandler):
    " Make it easier to use the _decoded_ response body "

    def str_response(self, req, resp, data_str):
        pass

    def data_response(self, req, resp, data):
        # decode
        enc = detect_encoding(data, resp)
        data_str = data.decode(enc, 'replace')

        # process
        data_str = self.str_response(req, resp, data_str)

        # re-encode (or keep the original bytes if nothing was returned)
        data = data_str.encode(enc) if data_str is not None else data

        return data


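# Minimal sketch of a custom handler built on RespStrHandler (hypothetical
# example, not part of morss): return a modified string to replace the response
# body, or None to leave it untouched.
#
#   class UpperCaseHandler(RespStrHandler):
#       def str_response(self, req, resp, data_str):
#           return data_str.upper()
#
#   # build_opener(UpperCaseHandler()).open(url) would then return upper-cased bodies

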
class DebugHandler(BaseHandler):
    handler_order = 2000

    def http_request(self, req):
        print(repr(req.header_items()))
        return req

    def http_response(self, req, resp):
        print(resp.headers.__dict__)
        return resp

    https_request = http_request
    https_response = http_response


class SizeLimitHandler(BaseHandler):
    """ Limit file size, defaults to 5MiB """

    handler_order = 450

    def __init__(self, limit=5*1024**2):
        self.limit = limit

    def http_response(self, req, resp):
        data = resp.read(self.limit)

        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response


def UnGzip(data):
    " Supports truncated files "
    return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)


class GZIPHandler(RespDataHandler):
    def http_request(self, req):
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

    def data_response(self, req, resp, data):
        if 200 <= resp.code < 300:
            if resp.headers.get('Content-Encoding') == 'gzip':
                resp.headers['Content-Encoding'] = 'identity'

                return UnGzip(data)


def detect_encoding(data, resp=None):
    enc = detect_raw_encoding(data, resp)

    if enc.lower() == 'gb2312':
        enc = 'gbk'

    return enc


def detect_raw_encoding(data, resp=None):
    if resp is not None:
        enc = resp.headers.get('charset')
        if enc is not None:
            return enc

        enc = parse_header(resp.headers.get('content-type', ''))[1].get('charset')
        if enc is not None:
            return enc

    match = re.search(b'charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        return match.groups()[0].lower().decode()

    match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        return match.groups()[0].lower().decode()

    enc = chardet.detect(data[-2000:])['encoding']
    if enc and enc != 'ascii':
        return enc

    return 'utf-8'


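# Illustrative calls (comments only, not executed): the lookup order is HTTP
# headers, then a charset=/encoding= declaration in the first bytes, then
# chardet, then the 'utf-8' fallback; gb2312 is widened to gbk.
#
#   detect_encoding(b'<meta charset="ISO-8859-1">')                     # -> 'iso-8859-1'
#   detect_encoding(b'<?xml version="1.0" encoding="windows-1252"?>')   # -> 'windows-1252'

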
class EncodingFixHandler(RespStrHandler):
    def str_response(self, req, resp, data_str):
        return data_str


class UAHandler(BaseHandler):
    def __init__(self, useragent=None):
        self.useragent = useragent

    def http_request(self, req):
        if self.useragent:
            req.add_unredirected_header('User-Agent', self.useragent)
        return req

    https_request = http_request


class BrowserlyHeaderHandler(BaseHandler):
    """ Add more headers to look less suspicious """

    def http_request(self, req):
        req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')
        return req

    https_request = http_request


def iter_html_tag(html_str, tag_name):
    " To avoid parsing whole pages when looking for a simple tag "

    re_tag = r'<%s(\s*[^>])*>' % tag_name
    re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'

    for tag_match in re.finditer(re_tag, html_str):
        attr_match = re.findall(re_attr, tag_match.group(0))

        if attr_match:
            yield dict(attr_match)


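# Illustrative call (comments only, not executed):
#
#   list(iter_html_tag('<link rel="alternate" type="application/rss+xml" href="/feed"/>', 'link'))
#   # -> [{'rel': 'alternate', 'type': 'application/rss+xml', 'href': '/feed'}]

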
class AlternateHandler(RespStrHandler):
    " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "

    def __init__(self, follow=None):
        self.follow = follow or []

    def str_response(self, req, resp, data_str):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]

        if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
            # oops, not what we were looking for, let's see if the html page
            # suggests an alternative page of the right type

            for link in iter_html_tag(data_str[:10000], 'link'):
                if (link.get('rel') == 'alternate'
                        and link.get('type') in self.follow
                        and 'href' in link):
                    resp.code = 302
                    resp.msg = 'Moved Temporarily'
                    resp.headers['location'] = link.get('href')
                    break


class HTTPEquivHandler(RespStrHandler):
    " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "

    handler_order = 600

    def str_response(self, req, resp, data_str):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]

        if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
            for meta in iter_html_tag(data_str[:10000], 'meta'):
                if 'http-equiv' in meta and 'content' in meta:
                    resp.headers[meta.get('http-equiv').lower()] = meta.get('content')


class HTTPAllRedirectHandler(HTTPRedirectHandler):
    def http_error_308(self, req, fp, code, msg, headers):
        return self.http_error_301(req, fp, 301, msg, headers)


class HTTPRefreshHandler(BaseHandler):
    handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000

    def http_response(self, req, resp):
        if 200 <= resp.code < 300:
            if resp.headers.get('refresh'):
                regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url=(["\']?)(?P<url>.+)\2$'
                match = re.search(regex, resp.headers.get('refresh'))

                if match:
                    url = match.groupdict()['url']

                    if url:
                        resp.code = 302
                        resp.msg = 'Moved Temporarily'
                        resp.headers['location'] = url

        return resp

    https_response = http_response


def parse_headers(text=u'\n\n'):
    if sys.version_info[0] >= 3:
        # python 3
        return message_from_string(text, _class=HTTPMessage)

    else:
        # python 2
        return HTTPMessage(StringIO(text))


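# Illustrative call (comments only, not executed): parse_headers() turns a raw
# header string into an HTTPMessage with case-insensitive access.
#
#   headers = parse_headers(u'Content-Type: text/html; charset=utf-8\n\n')
#   headers.get('content-type')   # -> 'text/html; charset=utf-8'

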
def error_response(code, msg, url=''):
    # return an error as a response
    resp = addinfourl(BytesIO(), parse_headers(), url, code)
    resp.msg = msg
    return resp


class CacheHandler(BaseHandler):
    " Cache based on etags/last-modified "

    privacy = 'private' # Websites can indicate whether the page should be cached
                        # by CDNs (e.g. this shouldn't be the case for
                        # private/confidential/user-specific pages). With this
                        # setting, decide whether you want the cache to behave
                        # like a CDN (i.e. don't cache private pages, 'public'),
                        # or like an end-user's private cache ('private').
                        # If unsure, 'public' is the safest bet, but many
                        # websites abuse this feature...

                        # NB. This overrides all the other min/max/policy settings.
    handler_order = 499

    def __init__(self, cache=None, force_min=None, force_max=None, policy=None):
        self.cache = cache or default_cache
        self.force_min = force_min
        self.force_max = force_max
        self.policy = policy # can be cached/refresh/offline/None (default)

        # Servers indicate how long they think their content is "valid". With
        # this parameter (force_min/max, expressed in seconds), we can override
        # the validity period (i.e. bypass the http headers)
        # Special choices, via "policy":
        #   cached: use the cache no matter what (and fetch the page online if
        #       not present in cache)
        #   refresh: valid zero seconds, i.e. force refresh
        #   offline: same as cached, i.e. use the cache no matter what, but do
        #       NOT fetch the page online if not present in cache, throw an
        #       error instead
        #   None: just follow protocols

        # sanity checks
        assert self.force_max is None or self.force_max >= 0
        assert self.force_min is None or self.force_min >= 0
        assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min

    def load(self, url):
        try:
            data = pickle.loads(self.cache[url])

        except KeyError:
            data = None

        else:
            data['headers'] = parse_headers(data['headers'] or unicode())

        return data

    def save(self, key, data):
        data['headers'] = unicode(data['headers'])
        self.cache[key] = pickle.dumps(data, 0)

    def cached_response(self, req, fallback=None):
        req.from_morss_cache = True

        data = self.load(req.get_full_url())

        if data is not None:
            # return the cache as a response
            resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
            resp.msg = data['msg']
            return resp

        else:
            return fallback

    def save_response(self, req, resp):
        if req.from_morss_cache:
            # do not re-save (would reset the timing)
            return resp

        data = resp.read()

        self.save(req.get_full_url(), {
            'code': resp.code,
            'msg': resp.msg,
            'headers': resp.headers,
            'data': data,
            'timestamp': time.time()
            })

        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    def http_request(self, req):
        req.from_morss_cache = False # to track whether it comes from cache

        data = self.load(req.get_full_url())

        if data is not None:
            if 'etag' in data['headers']:
                req.add_unredirected_header('If-None-Match', data['headers']['etag'])

            if 'last-modified' in data['headers']:
                req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])

        return req

    def http_open(self, req):
        # Reminder of how/when this function is called by urllib2:
        # If 'None' is returned, try your chance with the next-available handler
        # If a 'resp' is returned, stop there, and proceed with 'http_response'

        # Here, we try to see whether we want to use data from cache (i.e.
        # return 'resp'), or whether we want to refresh the content (return
        # 'None')

        data = self.load(req.get_full_url())

        if data is not None:
            # some info needed to process everything
            cache_control = parse_http_list(data['headers'].get('cache-control', ()))
            cache_control += parse_http_list(data['headers'].get('pragma', ()))

            cc_list = [x for x in cache_control if '=' not in x]
            cc_values = parse_keqv_list([x for x in cache_control if '=' in x])

            cache_age = time.time() - data['timestamp']

        # list in a simple way what to do in special cases

        if data is not None and 'private' in cc_list and self.privacy == 'public':
            # private data but public cache, do not use cache
            # privacy concern, so handled first and foremost
            # (and doesn't need to be addressed anymore afterwards)
            return None

        elif self.policy == 'offline':
            # use cache, or return an error
            return self.cached_response(
                req,
                error_response(409, 'Conflict', req.get_full_url())
            )

        elif self.policy == 'cached':
            # use cache, or fetch online
            return self.cached_response(req, None)

        elif self.policy == 'refresh':
            # force refresh
            return None

        elif data is None:
            # we have already settled all the cases that don't need the cache.
            # all the following ones need the cached item
            return None

        elif self.force_max is not None and cache_age > self.force_max:
            # older than we want, refresh
            return None

        elif self.force_min is not None and cache_age < self.force_min:
            # recent enough, use cache
            return self.cached_response(req)

        elif data['code'] == 301 and cache_age < 7*24*3600:
            # "301 Moved Permanently" has to be cached... as long as we want
            # (awesome HTTP specs), let's say a week (why not?). Use force_min=0
            # if you want to bypass this (needed for a proper refresh)
            return self.cached_response(req)

        elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
            # kindly follow the web server's indications and refresh. If the
            # same settings are used all along, this branch shouldn't be of any
            # use, since the page wouldn't have been cached in the first place;
            # the check is only performed "just in case"
            # NB. NOT respected if force_min is set
            return None

        elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
            # the server says it's still fine (and we trust it; if not, use the
            # overrides), use cache
            return self.cached_response(req)

        else:
            # according to the www, we have to refresh when nothing is said
            return None

    def http_response(self, req, resp):
        # code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)

        if resp.code == 304 and resp.url in self.cache:
            # we are hopefully the first after the HTTP handler, so no need
            # to re-run all the *_response
            # here: cached page, returning from cache
            return self.cached_response(req)

        elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
            cache_control = parse_http_list(resp.headers.get('cache-control', ()))
            cache_control += parse_http_list(resp.headers.get('pragma', ()))

            cc_list = [x for x in cache_control if '=' not in x]

            if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
                # kindly follow the web server's indications (do not save & return)
                return resp

            else:
                # save
                return self.save_response(req, resp)

        else:
            return self.save_response(req, resp)

    https_request = http_request
    https_open = http_open
    https_response = http_response


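# Illustrative sketch of the cache policies (comments only, not executed),
# going through custom_opener(), which wires CacheHandler in; 'url' stands for
# any http(s) address:
#
#   custom_opener(policy='cached').open(url)    # use cache if present, else fetch
#   custom_opener(policy='offline').open(url)   # use cache, never fetch (409 if missing)
#   custom_opener(policy='refresh').open(url)   # always hit the network
#   custom_opener(force_max=3600).open(url)     # refresh cache entries older than 1h

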
if 'IGNORE_SSL' in os.environ:
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context


if __name__ == '__main__':
    req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')

    if sys.flags.interactive:
        print('>>> Interactive shell: try using `req`')

    else:
        print(req['data'].decode(req['encoding']))