crawler: fix regression brought with 44a6b2591
continuous-integration/drone/push Build is passing Details

master
pictuga 2021-11-10 23:08:31 +01:00
parent cb21871c35
commit e42df98f83
1 changed files with 16 additions and 6 deletions

View File

@ -19,6 +19,7 @@ import os
import pickle import pickle
import random import random
import re import re
import sys
import time import time
import zlib import zlib
from cgi import parse_header from cgi import parse_header
@ -33,7 +34,7 @@ try:
# python 2 # python 2
from urllib import quote from urllib import quote
from mimetools import Message as message_from_string from httplib import HTTPMessage
from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler, from urllib2 import (BaseHandler, HTTPCookieProcessor, HTTPRedirectHandler,
Request, addinfourl, build_opener, parse_http_list, Request, addinfourl, build_opener, parse_http_list,
parse_keqv_list) parse_keqv_list)
@ -41,6 +42,7 @@ try:
except ImportError: except ImportError:
# python 3 # python 3
from email import message_from_string from email import message_from_string
from http.client import HTTPMessage
from urllib.parse import quote, urlparse, urlunparse from urllib.parse import quote, urlparse, urlunparse
from urllib.request import (BaseHandler, HTTPCookieProcessor, from urllib.request import (BaseHandler, HTTPCookieProcessor,
HTTPRedirectHandler, Request, addinfourl, HTTPRedirectHandler, Request, addinfourl,
@ -426,9 +428,19 @@ class HTTPRefreshHandler(BaseHandler):
https_response = http_response https_response = http_response
def parse_headers(text=u'\n\n'):
    """Parse a raw header block into a header/message object.

    text: the header lines as a unicode string, terminated by a blank
        line. The default is an empty header block (just the blank-line
        terminator), which yields a message with no headers.

    Returns an ``HTTPMessage`` on both python 2 and python 3, so callers
    get the same header interface whatever the interpreter.
    """
    if sys.version_info[0] >= 3:
        # python 3
        # _class makes the parser build an HTTPMessage instead of a plain
        # email.message.Message, matching what the python 2 branch returns
        return message_from_string(text, _class=HTTPMessage)

    else:
        # python 2
        # the python 2 HTTPMessage parses headers from a file-like object
        return HTTPMessage(StringIO(text))
def error_response(code, msg, url=''): def error_response(code, msg, url=''):
# return an error as a response # return an error as a response
resp = addinfourl(BytesIO(), message_from_string('\n\n'), url, code) resp = addinfourl(BytesIO(), parse_headers(), url, code)
resp.msg = msg resp.msg = msg
return resp return resp
@ -479,7 +491,7 @@ class CacheHandler(BaseHandler):
data = None data = None
else: else:
data['headers'] = message_from_string(data['headers'] or unicode()) # headers data['headers'] = parse_headers(data['headers'] or unicode())
return data return data
@ -505,7 +517,7 @@ class CacheHandler(BaseHandler):
self.save(req.get_full_url(), { self.save(req.get_full_url(), {
'code': resp.code, 'code': resp.code,
'msg': resp.msg, 'msg': resp.msg,
'headers': str(resp.headers), 'headers': resp.headers,
'data': data, 'data': data,
'timestamp': time.time() 'timestamp': time.time()
}) })
@ -646,8 +658,6 @@ if 'IGNORE_SSL' in os.environ:
if __name__ == '__main__': if __name__ == '__main__':
import sys
req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it') req = adv_get(sys.argv[1] if len(sys.argv) > 1 else 'https://morss.it')
if sys.flags.interactive: if sys.flags.interactive: