# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.

import os
import pickle
import random
import re
import sys
import time
import zlib
from cgi import parse_header
from collections import OrderedDict
from io import BytesIO, StringIO

import chardet

from .caching import default_cache

try:
    # python 2
    from urllib import quote

    from httplib import HTTPMessage
    from urllib2 import (BaseHandler, HTTPCookieProcessor,
                         HTTPRedirectHandler, Request, addinfourl,
                         build_opener, parse_http_list, parse_keqv_list)
    from urlparse import urlsplit

except ImportError:
    # python 3
    from email import message_from_string
    from http.client import HTTPMessage
    from urllib.parse import quote, urlsplit
    from urllib.request import (BaseHandler, HTTPCookieProcessor,
                                HTTPRedirectHandler, Request, addinfourl,
                                build_opener, parse_http_list, parse_keqv_list)

try:
    # python 2
    basestring

except NameError:
    # python 3
    basestring = unicode = str


MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml',
            'application/rdf+xml', 'application/atom+xml',
            'application/xhtml+xml'],
    'rss': ['application/rss+xml', 'application/rdf+xml',
            'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
    'json': ['application/json'],
}


DEFAULT_UAS = [
    # https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
]


PROTOCOL = ['http', 'https']


def get(*args, **kwargs):
    return adv_get(*args, **kwargs)['data']


def adv_get(url, post=None, timeout=None, *args, **kwargs):
    url = sanitize_url(url)

    if post is not None:
        post = post.encode('utf-8')

    if timeout is None:
        con = custom_opener(*args, **kwargs).open(url, data=post)

    else:
        con = custom_opener(*args, **kwargs).open(url, data=post, timeout=timeout)

    data = con.read()

    contenttype = con.info().get('Content-Type', '').split(';')[0]
    encoding = detect_encoding(data, con)

    return {
        'data': data,
        'url': con.geturl(),
        'con': con,
        'contenttype': contenttype,
        'encoding': encoding
    }
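# Illustrative usage of adv_get (example values; the URL is the same default
# used by the __main__ block at the bottom of this file):
#
#     >>> req = adv_get('https://morss.it')
#     >>> sorted(req.keys())
#     ['con', 'contenttype', 'data', 'encoding', 'url']
#     >>> text = req['data'].decode(req['encoding'])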
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
    # as per urllib2 source code, these handlers are added first
    # *unless* one of the custom handlers inherits from one of them
    #
    # [ProxyHandler, UnknownHandler, HTTPHandler,
    #  HTTPDefaultErrorHandler, HTTPRedirectHandler,
    #  FTPHandler, FileHandler, HTTPErrorProcessor]
    # & HTTPSHandler
    #
    # when processing a request:
    # (1) all the *_request are run
    # (2) the *_open are run until something is returned (other than None)
    # (3) all the *_response are run
    #
    # During (3), if an http error occurs (i.e. not a 2XX response code), the
    # http_error_* are run until something is returned (other than None). If
    # they all return nothing, a python error is raised

    handlers = [
        #DebugHandler(),
        SizeLimitHandler(500*1024), # 500KiB
        HTTPCookieProcessor(),
        GZIPHandler(),
        HTTPAllRedirectHandler(),
        HTTPEquivHandler(),
        HTTPRefreshHandler(),
        UAHandler(random.choice(DEFAULT_UAS)),
        BrowserlyHeaderHandler(),
        EncodingFixHandler(),
    ]

    if follow:
        handlers.append(AlternateHandler(MIMETYPE[follow]))

    handlers.append(CacheHandler(policy=policy, force_min=force_min, force_max=force_max))

    return build_opener(*handlers)


def is_ascii(string):
    # py3 strings have a native isascii() method, but this home-made check
    # keeps backward compatibility with python 2
    try:
        string.encode('ascii')

    except UnicodeError:
        return False

    else:
        return True


def soft_quote(string):
    " url-quote only when not a valid ascii string "

    if is_ascii(string):
        return string

    else:
        return quote(string.encode('utf-8'))


def sanitize_url(url):
    # make sure the url is unicode, i.e. not bytes
    if isinstance(url, bytes):
        url = url.decode('utf-8')

    # make sure there's a protocol (http://)
    if url.split(':', 1)[0] not in PROTOCOL:
        url = 'http://' + url

    # turns out some websites have really badly formatted urls (fix http:/badurl)
    url = re.sub('^(https?):/([^/])', r'\1://\2', url)

    # escape spaces
    url = url.replace(' ', '%20')

    # escape non-ascii unicode characters
    parts = urlsplit(url)

    parts = parts._replace(
        netloc=parts.netloc.replace(
            parts.hostname,
            parts.hostname.encode('idna').decode('ascii')
        ),
        path=soft_quote(parts.path),
        query=soft_quote(parts.query),
        fragment=soft_quote(parts.fragment),
    )

    return parts.geturl()
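# Illustrative behaviour of sanitize_url (example values only), fixing the
# missing slash and escaping the space:
#
#     >>> sanitize_url('http:/www.example.com/some page')
#     'http://www.example.com/some%20page'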
class RespDataHandler(BaseHandler):
    " Make it easier to use the response body "

    def data_response(self, req, resp, data):
        pass

    def http_response(self, req, resp):
        # read data
        data = resp.read()

        # process data and use returned content (if any)
        data = self.data_response(req, resp, data) or data

        # reformat the stuff
        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response


class RespStrHandler(RespDataHandler):
    " Make it easier to use the _decoded_ response body "

    def str_response(self, req, resp, data_str):
        pass

    def data_response(self, req, resp, data):
        # decode
        enc = detect_encoding(data, resp)
        data_str = data.decode(enc, 'replace')

        # process
        data_str = self.str_response(req, resp, data_str)

        # re-encode (only if something was returned)
        data = data_str.encode(enc) if data_str is not None else data

        # return
        return data


class DebugHandler(BaseHandler):
    handler_order = 2000

    def http_request(self, req):
        print(repr(req.header_items()))
        return req

    def http_response(self, req, resp):
        print(resp.headers.__dict__)
        return resp

    https_request = http_request
    https_response = http_response


class SizeLimitHandler(BaseHandler):
    """ Limit file size, defaults to 5MiB """

    handler_order = 450

    def __init__(self, limit=5*1024**2):
        self.limit = limit

    def http_response(self, req, resp):
        data = resp.read(self.limit)

        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response


def UnGzip(data):
    " Supports truncated files "
    return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)


class GZIPHandler(RespDataHandler):
    def http_request(self, req):
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

    def data_response(self, req, resp, data):
        if 200 <= resp.code < 300:
            if resp.headers.get('Content-Encoding') == 'gzip':
                resp.headers['Content-Encoding'] = 'identity'

                return UnGzip(data)


def detect_encoding(data, resp=None):
    enc = detect_raw_encoding(data, resp)

    if enc.lower() == 'gb2312':
        enc = 'gbk'

    return enc


def detect_raw_encoding(data, resp=None):
    if resp is not None:
        enc = resp.headers.get('charset')
        if enc is not None:
            return enc

        enc = parse_header(resp.headers.get('content-type', ''))[1].get('charset')
        if enc is not None:
            return enc

    match = re.search(b'charset=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        return match.groups()[0].lower().decode()

    match = re.search(b'encoding=["\']?([0-9a-zA-Z-]+)', data[:1000])
    if match:
        return match.groups()[0].lower().decode()

    enc = chardet.detect(data[-2000:])['encoding']
    if enc and enc != 'ascii':
        return enc

    return 'utf-8'


class EncodingFixHandler(RespStrHandler):
    " Fix invalid byte sequences by decoding/re-encoding (via RespStrHandler) "

    def str_response(self, req, resp, data_str):
        return data_str


class UAHandler(BaseHandler):
    def __init__(self, useragent=None):
        self.useragent = useragent

    def http_request(self, req):
        if self.useragent:
            req.add_unredirected_header('User-Agent', self.useragent)

        return req

    https_request = http_request


class BrowserlyHeaderHandler(BaseHandler):
    """ Add more headers to look less suspicious """

    def http_request(self, req):
        req.add_unredirected_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
        req.add_unredirected_header('Accept-Language', 'en-US,en;q=0.5')

        return req

    https_request = http_request


def iter_html_tag(html_str, tag_name):
    " To avoid parsing whole pages when looking for a simple tag "

    re_tag = r'<%s\s+[^>]+>' % tag_name
    re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'

    for tag_match in re.finditer(re_tag, html_str):
        attr_match = re.findall(re_attr, tag_match.group(0))

        if attr_match:
            yield dict(attr_match)
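# Illustrative behaviour of iter_html_tag (the tag string is an example only):
#
#     >>> list(iter_html_tag('<meta http-equiv="refresh" content="0; url=/new">', 'meta'))
#     [{'http-equiv': 'refresh', 'content': '0; url=/new'}]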
class AlternateHandler(RespStrHandler):
    " Follow <link rel='alternate' type='...' href='...'> "

    def __init__(self, follow=None):
        self.follow = follow or []

    def str_response(self, req, resp, data_str):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]

        if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
            # oops, not what we were looking for, let's see if the html page
            # suggests an alternative page of the right types

            for link in iter_html_tag(data_str[:10000], 'link'):
                if (link.get('rel') == 'alternate'
                        and link.get('type') in self.follow
                        and 'href' in link):
                    resp.code = 302
                    resp.msg = 'Moved Temporarily'
                    resp.headers['location'] = link.get('href')
                    break


class HTTPEquivHandler(RespStrHandler):
    " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "

    handler_order = 600

    def str_response(self, req, resp, data_str):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]

        if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
            for meta in iter_html_tag(data_str[:10000], 'meta'):
                if 'http-equiv' in meta and 'content' in meta:
                    resp.headers[meta.get('http-equiv').lower()] = meta.get('content')


class HTTPAllRedirectHandler(HTTPRedirectHandler):
    def http_error_308(self, req, fp, code, msg, headers):
        # handle "308 Permanent Redirect" the same way as a 301
        return self.http_error_301(req, fp, 301, msg, headers)


class HTTPRefreshHandler(BaseHandler):
    handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000

    def http_response(self, req, resp):
        if 200 <= resp.code < 300:
            if resp.headers.get('refresh'):
                regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
                match = re.search(regex, resp.headers.get('refresh'))

                if match:
                    url = match.groupdict()['url']

                    if url:
                        resp.code = 302
                        resp.msg = 'Moved Temporarily'
                        resp.headers['location'] = url

        return resp

    https_response = http_response


def parse_headers(text=u'\n\n'):
    if sys.version_info[0] >= 3:
        # python 3
        return message_from_string(text, _class=HTTPMessage)

    else:
        # python 2
        return HTTPMessage(StringIO(text))


def error_response(code, msg, url=''):
    # return an error as a response
    resp = addinfourl(BytesIO(), parse_headers(), url, code)
    resp.msg = msg
    return resp
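# Illustrative behaviour of parse_headers (the header string is an example
# only); lookups are case-insensitive in both the py2 and py3 branches:
#
#     >>> headers = parse_headers('cache-control: no-cache\n\n')
#     >>> headers.get('Cache-Control')
#     'no-cache'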
class CacheHandler(BaseHandler):
    " Cache based on etags/last-modified "

    privacy = 'private' # Websites can indicate whether the page should be cached
                        # by CDNs (e.g. shouldn't be the case for
                        # private/confidential/user-specific pages). With this
                        # setting, decide whether you want the cache to behave
                        # like a CDN (i.e. don't cache private pages, 'public'),
                        # or to behave like an end-user, caching private pages
                        # as well ('private'). If unsure, 'public' is the safest
                        # bet, but many websites abuse this feature...

                        # NB. This overrides all the other min/max/policy settings.

    handler_order = 499

    def __init__(self, cache=None, force_min=None, force_max=None, policy=None):
        self.cache = cache or default_cache
        self.force_min = force_min
        self.force_max = force_max
        self.policy = policy # can be cached/refresh/offline/None (default)

        # Servers indicate how long they think their content is "valid". With
        # this parameter (force_min/max, expressed in seconds), we can override
        # the validity period (i.e. bypassing http headers)
        # Special choices, via "policy":
        #   cached: use the cache no matter what (and fetch the page online if
        #       not present in cache)
        #   refresh: valid zero seconds, i.e. force refresh
        #   offline: same as cached, i.e. use the cache no matter what, but do
        #       NOT fetch the page online if not present in cache, throw an
        #       error instead
        #   None: just follow protocols

        # sanity checks
        assert self.force_max is None or self.force_max >= 0
        assert self.force_min is None or self.force_min >= 0
        assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min
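    # Illustrative usage of the policies (example only): custom_opener (defined
    # above) passes policy through to this handler, so an opener that never
    # hits the network, serving from cache or erroring with "409 Conflict", is:
    #
    #     >>> opener = custom_opener(policy='offline')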
    def load(self, url):
        try:
            data = pickle.loads(self.cache[url])

        except KeyError:
            data = None

        else:
            data['headers'] = parse_headers(data['headers'] or unicode())

        return data

    def save(self, key, data):
        data['headers'] = unicode(data['headers'])
        self.cache[key] = pickle.dumps(data, 0)

    def cached_response(self, req, fallback=None):
        req.from_morss_cache = True

        data = self.load(req.get_full_url())

        if data is not None:
            # return the cache as a response
            resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
            resp.msg = data['msg']
            return resp

        else:
            return fallback

    def save_response(self, req, resp):
        if req.from_morss_cache:
            # do not re-save (would reset the timing)
            return resp

        data = resp.read()

        self.save(req.get_full_url(), {
            'code': resp.code,
            'msg': resp.msg,
            'headers': resp.headers,
            'data': data,
            'timestamp': time.time()
        })

        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    def http_request(self, req):
        req.from_morss_cache = False # to track whether it comes from cache

        data = self.load(req.get_full_url())

        if data is not None:
            if 'etag' in data['headers']:
                req.add_unredirected_header('If-None-Match', data['headers']['etag'])

            if 'last-modified' in data['headers']:
                req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])

        return req

    def http_open(self, req):
        # Reminder of how/when this function is called by urllib2:
        # If 'None' is returned, try your chance with the next-available handler
        # If a 'resp' is returned, stop there, and proceed with 'http_response'
        #
        # Here, we try to see whether we want to use data from cache (i.e.
        # return 'resp'), or whether we want to refresh the content (return
        # 'None')

        data = self.load(req.get_full_url())

        if data is not None:
            # some info needed to process everything
            cache_control = parse_http_list(data['headers'].get('cache-control', ()))
            cache_control += parse_http_list(data['headers'].get('pragma', ()))

            cc_list = [x for x in cache_control if '=' not in x]
            cc_values = parse_keqv_list([x for x in cache_control if '=' in x])

            cache_age = time.time() - data['timestamp']

        # list in a simple way what to do in special cases

        if data is not None and 'private' in cc_list and self.privacy == 'public':
            # private data but public cache, do not use cache
            # privacy concern, so handled first and foremost
            # (and doesn't need to be addressed anymore afterwards)
            return None

        elif self.policy == 'offline':
            # use cache, or return an error
            return self.cached_response(
                req,
                error_response(409, 'Conflict', req.get_full_url())
            )

        elif self.policy == 'cached':
            # use cache, or fetch online
            return self.cached_response(req, None)

        elif self.policy == 'refresh':
            # force refresh
            return None

        elif data is None:
            # we have already settled all the cases that don't need the cache.
            # all the following ones need the cached item
            return None

        elif self.force_max is not None and cache_age > self.force_max:
            # older than we want, refresh
            return None

        elif self.force_min is not None and cache_age < self.force_min:
            # recent enough, use cache
            return self.cached_response(req)

        elif data['code'] == 301 and cache_age < 7*24*3600:
            # "301 Moved Permanently" has to be cached... as long as we want
            # (awesome HTTP specs), let's say a week (why not?). Use force_min=0
            # if you want to bypass this (needed for a proper refresh)
            return self.cached_response(req)

        elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
            # kindly follow web servers indications, refresh. If the same
            # settings are used all along, this section shouldn't be of any
            # use, since the page wouldn't be cached in the first place;
            # the check is only performed "just in case"
            # NB. NOT respected if force_min is set
            return None

        elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
            # server says it's still fine (and we trust it; if not, use the
            # overrides), use cache
            return self.cached_response(req)

        else:
            # according to the www, we have to refresh when nothing is said
            return None

    def http_response(self, req, resp):
        # code for after-fetch, to know whether to save the response to the
        # cache (if sticking to http headers' will)

        if resp.code == 304 and resp.url in self.cache:
            # we are hopefully the first after the HTTP handler, so no need
            # to re-run all the *_response
            # here: cached page, returning from cache
            return self.cached_response(req)

        elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
            cache_control = parse_http_list(resp.headers.get('cache-control', ()))
            cache_control += parse_http_list(resp.headers.get('pragma', ()))

            cc_list = [x for x in cache_control if '=' not in x]

            if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
                # kindly follow web servers indications (do not save & return)
                return resp

            else:
                # save
                return self.save_response(req, resp)

        else:
            return self.save_response(req, resp)

    https_request = http_request
    https_open = http_open
    https_response = http_response


if 'IGNORE_SSL' in os.environ:
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context


if __name__ == '__main__':
    req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')

    if sys.flags.interactive:
        print('>>> Interactive shell: try using `req`')

    else:
        print(req['data'].decode(req['encoding']))
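# Illustrative CLI usage (assuming this file is installed as morss/crawler.py;
# the URL is an example):
#
#     $ python -m morss.crawler https://morss.it
#     $ python -i -m morss.crawler https://morss.it   # drops into a shell with `req`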