morss/morss/crawler.py

688 lines
22 KiB
Python
Raw Permalink Normal View History

2020-08-26 18:08:22 +00:00
# This file is part of morss
#
# Copyright (C) 2013-2020 pictuga <contact@pictuga.com>
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
2020-08-23 16:45:44 +00:00
import os
2021-09-11 11:10:42 +00:00
import pickle
2021-09-08 18:54:34 +00:00
import random
import re
import sys
2021-09-08 18:54:34 +00:00
import time
import zlib
2017-10-27 21:14:08 +00:00
from cgi import parse_header
from collections import OrderedDict
2021-09-08 18:54:34 +00:00
from io import BytesIO, StringIO
import chardet
2021-09-18 14:08:01 +00:00
from .caching import default_cache
2021-09-11 11:20:34 +00:00
try:
# python 2
from urllib import quote
2021-09-08 18:54:34 +00:00
from httplib import HTTPMessage
2021-09-11 09:34:16 +00:00
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
Request, addinfourl, build_opener, parse_http_list,
parse_keqv_list)
from urlparse import urlsplit
2015-02-25 10:07:09 +00:00
except ImportError:
# python 3
from email import message_from_string
from http.client import HTTPMessage
from urllib.parse import quote, urlsplit
2021-09-11 09:34:16 +00:00
from urllib.request import (BaseHandler, HTTPCookieProcessor,
HTTPRedirectHandler, Request, addinfourl,
build_opener, parse_http_list, parse_keqv_list)
2015-02-25 16:50:23 +00:00
try:
# python 2
2015-02-25 16:50:23 +00:00
basestring
except NameError:
# python 3
basestring = unicode = str
2015-02-25 16:50:23 +00:00
# Mime-types belonging to each content family, keyed by the family's short
# name. `custom_opener(follow=...)` uses these keys to pick the mime-types
# handed to `AlternateHandler`; the 'html' entry is also what the html-only
# handlers check the response's Content-Type against.
MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
    'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml'],
    'json': ['application/json'],
}
2020-04-24 09:28:39 +00:00
# Pool of browser User-Agent strings; `custom_opener` picks one at random for
# each opener it builds.
DEFAULT_UAS = [
    # source: https://gist.github.com/fijimunkii/952acac988f2d25bef7e0284bc63c406
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
]
2020-04-28 20:03:49 +00:00
# URL schemes `sanitize_url` accepts as-is; a url whose text before the first
# ':' is not in this list gets 'http://' prepended.
PROTOCOL = ['http', 'https']
2020-04-07 08:30:17 +00:00
def get(*args, **kwargs):
    " Shortcut around `adv_get` which only hands back the response body "
    response = adv_get(*args, **kwargs)
    return response['data']
2020-04-07 08:30:17 +00:00
2021-09-08 18:43:21 +00:00
def adv_get(url, post=None, timeout=None, *args, **kwargs):
    """ Fetch `url` and return the body together with some response metadata

    url: address to fetch, cleaned up with `sanitize_url` first
    post: optional request body; text is encoded to utf-8, bytes are sent as-is
    timeout: passed as `timeout` to the opener's `open()`; left out of the
        call altogether when None
    *args, **kwargs: forwarded untouched to `custom_opener`

    Returns a dict with the following keys:
        'data': the body, as returned by the response's `read()`
        'url': the url reported by the response (`geturl()`)
        'con': the response object itself
        'contenttype': the Content-Type header, without its parameters
        'encoding': the encoding guessed by `detect_encoding`
    """
    url = sanitize_url(url)

    if post is not None and not isinstance(post, bytes):
        # an already-encoded payload has no usable .encode() on python 3, so
        # only text gets encoded here
        post = post.encode('utf-8')

    open_kwargs = {'data': post}
    if timeout is not None:
        open_kwargs['timeout'] = timeout

    con = custom_opener(*args, **kwargs).open(url, **open_kwargs)

    data = con.read()
    contenttype = con.info().get('Content-Type', '').split(';')[0]
    encoding = detect_encoding(data, con)

    return {
        'data': data,
        'url': con.geturl(),
        'con': con,
        'contenttype': contenttype,
        'encoding': encoding
    }
2020-04-07 08:30:17 +00:00
2021-11-08 21:02:23 +00:00
def custom_opener(follow=None, policy=None, force_min=None, force_max=None):
    """ Assemble the urllib opener with all the custom handlers of this module

    follow: optional key of `MIMETYPE`; when set, an `AlternateHandler`
        looking for the matching mime-types is added to the chain
    policy, force_min, force_max: handed over to `CacheHandler`
    """
    # as per urllib2 source code, these handlers are added first
    # *unless* one of the custom handlers inherits from one of them
    #
    # [ProxyHandler, UnknownHandler, HTTPHandler,
    # HTTPDefaultErrorHandler, HTTPRedirectHandler,
    # FTPHandler, FileHandler, HTTPErrorProcessor]
    # & HTTPSHandler
    #
    # when processing a request:
    # (1) all the *_request are run
    # (2) the *_open are run until sth is returned (other than None)
    # (3) all the *_response are run
    #
    # During (3), if an http error occurs (i.e. not a 2XX response code), the
    # http_error_* are run until sth is returned (other than None). If they all
    # return nothing, a python error is raised
    chain = [
        #DebugHandler(),
        SizeLimitHandler(500*1024), # 500KiB
        HTTPCookieProcessor(),
        GZIPHandler(),
        HTTPAllRedirectHandler(),
        HTTPEquivHandler(),
        HTTPRefreshHandler(),
        UAHandler(random.choice(DEFAULT_UAS)),
        BrowserlyHeaderHandler(),
        EncodingFixHandler(),
    ]

    # optional: chase the alternate (e.g. feed) version advertised by html pages
    extra = [AlternateHandler(MIMETYPE[follow])] if follow else []

    cache_handler = CacheHandler(policy=policy, force_min=force_min, force_max=force_max)

    return build_opener(*(chain + extra + [cache_handler]))
def is_ascii(string):
    " Tell whether `string` can be encoded as plain ascii "
    # py3 ships a native equivalent, this home-made check is kept for
    # backward compatibility
    try:
        string.encode('ascii')
        return True
    except UnicodeError:
        return False
def soft_quote(string):
    " url-quote only when not a valid ascii string "
    # pure-ascii input is handed back untouched, anything else is
    # percent-encoded from its utf-8 bytes
    return string if is_ascii(string) else quote(string.encode('utf-8'))
2020-04-28 20:03:49 +00:00
def sanitize_url(url):
    """ Clean up a (possibly sloppy) url so that it can be fetched

    url: the address, as text or as utf-8 encoded bytes

    Returns the url as text, with a scheme, with spaces escaped, with the
    hostname idna-encoded and with non-ascii path/query/fragment url-quoted.
    """
    # make sure the url is unicode, i.e. not bytes
    if isinstance(url, bytes):
        url = url.decode('utf-8')

    # make sure there's a protocol (http://)
    if url.split(':', 1)[0] not in PROTOCOL:
        url = 'http://' + url

    # turns out some websites have really badly formatted urls (fix http:/badurl)
    url = re.sub('^(https?):/([^/])', r'\1://\2', url)

    # escape spaces
    url = url.replace(' ', '%20')

    # escape non-ascii unicode characters
    parts = urlsplit(url)

    netloc = parts.netloc
    if parts.hostname:
        # urlsplit reports no hostname (None) for host-less input such as an
        # empty url; there is nothing to idna-encode then, and calling
        # .encode() on it would crash
        netloc = netloc.replace(
            parts.hostname,
            parts.hostname.encode('idna').decode('ascii')
        )

    parts = parts._replace(
        netloc=netloc,
        path=soft_quote(parts.path),
        query=soft_quote(parts.query),
        fragment=soft_quote(parts.fragment),
    )

    return parts.geturl()
2020-10-30 21:15:35 +00:00
class RespDataHandler(BaseHandler):
    " Make it easier to use the response body "

    def data_response(self, req, resp, data):
        """ Hook for subclasses: receives the raw body of the response

        Return the replacement body, or None to keep `data` as it is.
        """
        pass

    # the hook used to be declared under this misspelled name (while
    # `http_response` was calling the correctly spelled one); alias kept so
    # that any existing reference to it still resolves
    data_reponse = data_response

    def http_response(self, req, resp):
        """ Run the body through `data_response` and re-wrap it into a fresh response """
        # read data
        data = resp.read()

        # process data and use returned content (if any). An empty body is a
        # valid result, hence the explicit None check rather than truthiness
        new_data = self.data_response(req, resp, data)
        if new_data is not None:
            data = new_data

        # the original stream is consumed by now: rebuild a response object
        # around the data, with the same headers, url, code and msg
        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response
class RespStrHandler(RespDataHandler):
    " Make it easier to use the _decoded_ response body "

    def str_response(self, req, resp, data_str):
        """ Hook for subclasses: receives the decoded body of the response

        Return the replacement text, or None to keep the body as it is.
        """
        pass

    # the hook used to be declared under this misspelled name (while
    # `data_response` was calling the correctly spelled one); alias kept so
    # that any existing reference to it still resolves
    str_reponse = str_response

    def data_response(self, req, resp, data):
        """ Decode the body, run it through `str_response`, and encode the result back """
        # decode (undecodable bytes are replaced rather than raising)
        enc = detect_encoding(data, resp)
        data_str = data.decode(enc, 'replace')

        # process
        data_str = self.str_response(req, resp, data_str)

        # re-encode with the same encoding, or fall back onto the untouched
        # bytes when the hook returned nothing
        return data_str.encode(enc) if data_str is not None else data
class DebugHandler(BaseHandler):
    " Print the headers of requests and responses passing through the opener "

    handler_order = 2000

    def http_request(self, req):
        header_dump = repr(req.header_items())
        print(header_dump)
        return req

    def http_response(self, req, resp):
        # raw attribute dict of the headers object
        print(vars(resp.headers))
        return resp

    https_request = http_request
    https_response = http_response
class SizeLimitHandler(BaseHandler):
    """ Cap the amount of body data read from a response (5MiB by default) """

    handler_order = 450

    def __init__(self, limit=5*1024**2):
        self.limit = limit

    def http_response(self, req, resp):
        # read at most `limit` bytes, then wrap them into a new response which
        # keeps the headers, url, code and msg of the original one
        capped = addinfourl(BytesIO(resp.read(self.limit)), resp.headers, resp.url, resp.code)
        capped.msg = resp.msg
        return capped

    https_response = http_response
def UnGzip(data):
    " Decompress `data`; supports truncated files "
    # adding 32 to the window size turns on automatic zlib/gzip header detection
    window_bits = zlib.MAX_WBITS | 32
    inflater = zlib.decompressobj(window_bits)
    return inflater.decompress(data)
class GZIPHandler(RespDataHandler):
    " Ask servers for gzip-compressed bodies and decompress them on the way back "

    def http_request(self, req):
        req.add_unredirected_header('Accept-Encoding', 'gzip')
        return req

    # urllib looks up the *_request hooks per url scheme: without this alias,
    # https requests would never advertise gzip support
    https_request = http_request

    def data_response(self, req, resp, data):
        """ Return the decompressed body of successful gzip-encoded responses

        Nothing (None) is returned otherwise, which leaves the body untouched.
        """
        if 200 <= resp.code < 300:
            if resp.headers.get('Content-Encoding') == 'gzip':
                # the body handed over to the next handlers is plain: reflect
                # this in the headers
                resp.headers['Content-Encoding'] = 'identity'

                return UnGzip(data)
2017-10-27 21:14:08 +00:00
def detect_encoding(data, resp=None):
    " Same as `detect_raw_encoding`, except that gb2312 is reported as gbk "
    raw = detect_raw_encoding(data, resp)
    return 'gbk' if raw.lower() == 'gb2312' else raw
def detect_raw_encoding(data, resp=None):
    """ Guess the encoding of a response body

    data: the body, as bytes
    resp: optional response object, whose headers are looked at first

    Sources are tried in this order: a 'charset' header, the charset
    parameter of the 'content-type' header, a `charset=` then an `encoding=`
    declaration within the first 1000 bytes of the body, and finally chardet
    on the last 2000 bytes. Falls back onto 'utf-8'.
    """
    if resp is not None:
        headers = resp.headers

        declared = headers.get('charset')
        if declared is None:
            declared = parse_header(headers.get('content-type', ''))[1].get('charset')

        if declared is not None:
            return declared

    head = data[:1000]
    for pattern in (b'charset=["\']?([0-9a-zA-Z-]+)', b'encoding=["\']?([0-9a-zA-Z-]+)'):
        found = re.search(pattern, head)
        if found:
            return found.group(1).lower().decode()

    guess = chardet.detect(data[-2000:])['encoding']
    if guess and guess != 'ascii':
        return guess

    return 'utf-8'
class EncodingFixHandler(RespStrHandler):
    " Hand the decoded body back unchanged, so that it only goes through the decode/re-encode round-trip of `RespStrHandler` "

    def str_response(self, req, resp, data_str):
        return data_str
class UAHandler(BaseHandler):
    " Send the given User-Agent header along with each request (if any was given) "

    def __init__(self, useragent=None):
        self.useragent = useragent

    def http_request(self, req):
        agent = self.useragent
        if not agent:
            # nothing configured: leave the request as it is
            return req

        req.add_unredirected_header('User-Agent', agent)
        return req

    https_request = http_request
2020-04-05 19:11:57 +00:00
class BrowserlyHeaderHandler(BaseHandler):
    """ Add more headers to look less suspicious """

    def http_request(self, req):
        browser_headers = (
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('Accept-Language', 'en-US,en;q=0.5'),
        )

        for name, value in browser_headers:
            req.add_unredirected_header(name, value)

        return req

    https_request = http_request
def iter_html_tag(html_str, tag_name):
    """ Yield the quoted attributes of each `tag_name` tag, as a dict

    Regex-based, to avoid parsing whole pages when looking for a simple tag.
    Only tags with something after their name are matched, and only
    attributes with a quoted value are reported.
    """
    tag_pattern = r'<%s\s+[^>]+>' % tag_name
    attr_pattern = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'

    for tag in re.finditer(tag_pattern, html_str):
        # findall always gives a list of (key, value) pairs, possibly empty, in
        # which case an empty dict is yielded for that tag
        yield dict(re.findall(attr_pattern, tag.group(0)))
class AlternateHandler(RespStrHandler):
    " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "

    def __init__(self, follow=None):
        self.follow = follow or []

    def str_response(self, req, resp, data_str):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]

        is_unwanted_html = (
            200 <= resp.code < 300
            and len(self.follow)
            and contenttype in MIMETYPE['html']
            and contenttype not in self.follow
        )

        if not is_unwanted_html:
            return

        # oops, not what we were looking for, let's see if the html page
        # suggests an alternative page of the right types
        for link in iter_html_tag(data_str[:10000], 'link'):
            if link.get('rel') != 'alternate':
                continue

            if link.get('type') not in self.follow:
                continue

            if 'href' not in link:
                continue

            # turn the response into a redirection towards the first
            # matching alternate
            resp.code = 302
            resp.msg = 'Moved Temporarily'
            resp.headers['location'] = link.get('href')
            break
class HTTPEquivHandler(RespStrHandler):
    " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "

    handler_order = 600

    def str_response(self, req, resp, data_str):
        contenttype = resp.info().get('Content-Type', '').split(';')[0]

        if not (200 <= resp.code < 300 and contenttype in MIMETYPE['html']):
            return

        for meta in iter_html_tag(data_str[:10000], 'meta'):
            if 'http-equiv' not in meta or 'content' not in meta:
                continue

            # copy the pseudo-header over to the real headers (lower-cased name)
            header_name = meta.get('http-equiv').lower()
            resp.headers[header_name] = meta.get('content')
2021-09-11 09:34:16 +00:00
class HTTPAllRedirectHandler(HTTPRedirectHandler):
    " Redirect handler which also follows 308 responses "

    def http_error_308(self, req, fp, code, msg, headers):
        # processed exactly like a 301, whose code is passed on instead of the
        # received one
        handled_as = 301
        return self.http_error_301(req, fp, handled_as, msg, headers)
class HTTPRefreshHandler(BaseHandler):
    " Turn 'refresh' headers pointing to a url into a redirection "

    handler_order = 700 # HTTPErrorProcessor has a handler_order of 1000

    def http_response(self, req, resp):
        refresh = resp.headers.get('refresh') if 200 <= resp.code < 300 else None

        if refresh:
            # "<delay>; url=<target>", the target being optionally quoted
            regex = r'(?i)^(?P<delay>[0-9]+)\s*;\s*url\s*=\s*(["\']?)(?P<url>.+)\2$'
            match = re.search(regex, refresh)
            url = match.groupdict()['url'] if match else None

            if url:
                resp.code = 302
                resp.msg = 'Moved Temporarily'
                resp.headers['location'] = url

        return resp

    https_response = http_response
2021-11-10 22:25:03 +00:00
def parse_headers(text=u'\n\n'):
    " Parse a raw block of headers into an HTTPMessage (no header at all by default) "
    if sys.version_info[0] < 3:
        # python 2
        return HTTPMessage(StringIO(text))

    # python 3
    return message_from_string(text, _class=HTTPMessage)
2021-11-08 21:02:23 +00:00
def error_response(code, msg, url=''):
    " Build a response with an empty body and no headers, carrying the given status code and message "
    failure = addinfourl(BytesIO(), parse_headers(), url, code)
    failure.msg = msg
    return failure
class CacheHandler(BaseHandler):
    " Cache based on etags/last-modified "

    privacy = 'private' # Websites can indicate whether the page should be
                        # cached by CDNs (which shouldn't be the case for
                        # private/confidential/user-specific pages). With this
                        # setting, decide whether you want the cache to behave
                        # like a CDN (i.e. don't cache private pages,
                        # 'public'), or like an end-user's own cache (i.e. do
                        # cache private pages, 'private'). If unsure, 'public'
                        # is the safest bet, but many websites abuse this
                        # feature...

                        # NB. This overrides all the other min/max/policy settings.

    handler_order = 499

    def __init__(self, cache=None, force_min=None, force_max=None, policy=None):
        # explicit None check: a caller-supplied cache which is still empty is
        # falsy, and must not be silently swapped for the default one
        self.cache = cache if cache is not None else default_cache
        self.force_min = force_min
        self.force_max = force_max
        self.policy = policy # can be cached/refresh/offline/None (default)

        # Servers indicate how long they think their content is "valid". With
        # this parameter (force_min/max, expressed in seconds), we can override
        # the validity period (i.e. bypassing http headers)
        # Special choices, via "policy":
        #   cached: use the cache no matter what (and fetch the page online if
        #           not present in cache)
        #   refresh: valid zero second, i.e. force refresh
        #   offline: same as cached, i.e. use the cache no matter what, but do
        #            NOT fetch the page online if not present in cache, throw an
        #            error instead
        #   None: just follow protocols

        # sanity checks
        assert self.force_max is None or self.force_max >= 0
        assert self.force_min is None or self.force_min >= 0
        assert self.force_max is None or self.force_min is None or self.force_max >= self.force_min

    def load(self, url):
        " Return the cached entry for `url` (a dict, headers parsed), or None when there is none "
        try:
            # NB. unpickling is only safe as long as the cache backend holds
            # nothing but what `save` put there: do not point this handler at
            # an untrusted/shared store
            data = pickle.loads(self.cache[url])

        except KeyError:
            data = None

        else:
            data['headers'] = parse_headers(data['headers'] or unicode())

        return data

    def save(self, key, data):
        " Store the entry `data` (a dict) under `key`; its headers are flattened to text first "
        data['headers'] = unicode(data['headers'])
        self.cache[key] = pickle.dumps(data, 0)

    def cached_response(self, req, fallback=None):
        """ Return the cached version of `req` as a response, or `fallback` if there is none

        `req.from_morss_cache` is set to tell `save_response` whether the
        response was produced here (i.e. must not be saved) or is still to be
        fetched online.
        """
        data = self.load(req.get_full_url())

        if data is None:
            # Cache miss. With a fallback, the response handed back is still
            # made-up here and must not be saved. Without one (None), the page
            # gets fetched online afterwards: flagging the request as "from
            # cache" would prevent that fresh copy from ever being saved
            req.from_morss_cache = fallback is not None
            return fallback

        req.from_morss_cache = True

        # return the cache as a response
        resp = addinfourl(BytesIO(data['data']), data['headers'], req.get_full_url(), data['code'])
        resp.msg = data['msg']

        return resp

    def save_response(self, req, resp):
        " Save `resp` into the cache (unless it comes from there) and return a readable copy of it "
        if req.from_morss_cache:
            # do not re-save (would reset the timing)
            return resp

        data = resp.read()

        self.save(req.get_full_url(), {
            'code': resp.code,
            'msg': resp.msg,
            'headers': resp.headers,
            'data': data,
            'timestamp': time.time()
            })

        # the stream was consumed by the read() above: rebuild the response
        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    def http_request(self, req):
        " Add the conditional headers (If-None-Match/If-Modified-Since) matching the cached entry, if any "
        req.from_morss_cache = False # to track whether it comes from cache

        data = self.load(req.get_full_url())

        if data is not None:
            if 'etag' in data['headers']:
                req.add_unredirected_header('If-None-Match', data['headers']['etag'])

            if 'last-modified' in data['headers']:
                req.add_unredirected_header('If-Modified-Since', data['headers']['last-modified'])

        return req

    def http_open(self, req):
        # Reminder of how/when this function is called by urllib2:
        # If 'None' is returned, try your chance with the next-available handler
        # If a 'resp' is returned, stop there, and proceed with 'http_response'

        # Here, we try to see whether we want to use data from cache (i.e.
        # return 'resp'), or whether we want to refresh the content (return
        # 'None')

        data = self.load(req.get_full_url())

        if data is not None:
            # some info needed to process everything
            cache_control = parse_http_list(data['headers'].get('cache-control', ()))
            cache_control += parse_http_list(data['headers'].get('pragma', ()))

            cc_list = [x for x in cache_control if '=' not in x]
            cc_values = parse_keqv_list([x for x in cache_control if '=' in x])

            cache_age = time.time() - data['timestamp']

        # list in a simple way what to do in special cases

        if data is not None and 'private' in cc_list and self.privacy == 'public':
            # private data but public cache, do not use cache
            # privacy concern, so handled first and foremost
            # (and doesn't need to be addressed anymore afterwards)
            return None

        elif self.policy == 'offline':
            # use cache, or return an error
            return self.cached_response(
                req,
                error_response(409, 'Conflict', req.get_full_url())
            )

        elif self.policy == 'cached':
            # use cache, or fetch online
            return self.cached_response(req, None)

        elif self.policy == 'refresh':
            # force refresh
            return None

        elif data is None:
            # we have already settled all the cases that don't need the cache.
            # all the following ones need the cached item
            return None

        elif self.force_max is not None and cache_age > self.force_max:
            # older than we want, refresh
            return None

        elif self.force_min is not None and cache_age < self.force_min:
            # recent enough, use cache
            return self.cached_response(req)

        elif data['code'] == 301 and cache_age < 7*24*3600:
            # "301 Moved Permanently" has to be cached...as long as we want
            # (awesome HTTP specs), let's say a week (why not?). Use force_min=0
            # if you want to bypass this (needed for a proper refresh)
            return self.cached_response(req)

        elif self.force_min is None and ('no-cache' in cc_list or 'no-store' in cc_list):
            # kindly follow web servers indications, refresh. If the same
            # settings are used all along, this section shouldn't be of any
            # use, since the page wouldn't be cached in the first place: the
            # check is only performed "just in case"
            # NB. NOT respected if force_min is set
            return None

        elif 'max-age' in cc_values and int(cc_values['max-age']) > cache_age:
            # server says it's still fine (and we trust him, if not, use overrides), use cache
            return self.cached_response(req)

        else:
            # according to the www, we have to refresh when nothing is said
            return None

    def http_response(self, req, resp):
        # code for after-fetch, to know whether to save to hard-drive (if sticking to http headers' will)

        if resp.code == 304 and resp.url in self.cache:
            # we are hopefully the first after the HTTP handler, so no need
            # to re-run all the *_response
            # here: cached page, returning from cache
            return self.cached_response(req)

        elif self.force_min is None and ('cache-control' in resp.headers or 'pragma' in resp.headers):
            cache_control = parse_http_list(resp.headers.get('cache-control', ()))
            cache_control += parse_http_list(resp.headers.get('pragma', ()))

            cc_list = [x for x in cache_control if '=' not in x]

            if 'no-cache' in cc_list or 'no-store' in cc_list or ('private' in cc_list and self.privacy == 'public'):
                # kindly follow web servers indications (do not save & return)
                return resp

            else:
                # save
                return self.save_response(req, resp)

        else:
            return self.save_response(req, resp)

    https_request = http_request
    https_open = http_open
    https_response = http_response
2020-10-03 17:57:08 +00:00
# Opt-in switch: when the IGNORE_SSL environment variable is set (whatever its
# value), the ssl module's default https context factory is replaced with the
# unverified one, i.e. https certificates are no longer checked. This is a
# process-wide change made at import time.
if 'IGNORE_SSL' in os.environ:
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
if __name__ == '__main__':
    # fetch the url given on the command line (or the default one)
    target = sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it'
    req = adv_get(target)

    if not sys.flags.interactive:
        print(req['data'].decode(req['encoding']))

    else:
        # `python -i`: leave the result at hand in the shell
        print('>>> Interactive shell: try using `req`')