Compare commits

...

6 Commits

Author SHA1 Message Date
pictuga 01a7667032 Fix error due to remaining log force code 2021-01-14 00:51:47 +01:00
pictuga 3e886caaab crawler: drop encoding setting 2020-10-30 22:41:16 +01:00
pictuga ad927e03a7 crawler: use regex instead of lxml
Less reliable but should be faster
2020-10-30 22:21:19 +01:00
pictuga 0efb096fa7 crawler: shift gzip & encoding-fix to intermediary handler 2020-10-30 22:16:51 +01:00
pictuga 9ab2e488ef crawler: add intermediary handlers 2020-10-30 22:15:35 +01:00
pictuga b525ab0d26 crawler: fix typo 2020-10-30 22:12:43 +01:00
2 changed files with 82 additions and 89 deletions

View File

@ -23,7 +23,6 @@ from io import BytesIO, StringIO
import re import re
import chardet import chardet
from cgi import parse_header from cgi import parse_header
import lxml.html
import time import time
import threading import threading
import random import random
@ -105,7 +104,7 @@ def adv_get(url, timeout=None, *args, **kwargs):
} }
def custom_handler(follow=None, delay=None, encoding=None): def custom_handler(follow=None, delay=None):
handlers = [] handlers = []
# as per urllib2 source code, these Handelers are added first # as per urllib2 source code, these Handelers are added first
@ -124,7 +123,7 @@ def custom_handler(follow=None, delay=None, encoding=None):
handlers.append(HTTPRefreshHandler()) handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(random.choice(DEFAULT_UAS))) handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
handlers.append(BrowserlyHeaderHandler()) handlers.append(BrowserlyHeaderHandler())
handlers.append(EncodingFixHandler(encoding)) handlers.append(EncodingFixHandler())
if follow: if follow:
handlers.append(AlternateHandler(MIMETYPE[follow])) handlers.append(AlternateHandler(MIMETYPE[follow]))
@ -176,6 +175,51 @@ def sanitize_url(url):
return urlunparse(parts) return urlunparse(parts)
class RespDataHandler(BaseHandler):
    """Make it easier to use the response body.

    Subclasses override `data_response` to inspect or rewrite the raw body
    bytes; this class takes care of reading the body and wrapping the result
    back into a response object.
    """

    def data_response(self, req, resp, data):
        """Hook called with the raw response body.

        Return the replacement body as bytes, or None to leave the body
        untouched. The default implementation leaves it untouched.
        """
        return None

    # The hook was first declared under this misspelled name while
    # `http_response` called the correctly spelled one, so the base class
    # raised AttributeError when used directly. The alias keeps the old
    # attribute name reachable.
    data_reponse = data_response

    def http_response(self, req, resp):
        """Read the body, run it through `data_response`, return a new response."""
        # read data
        data = resp.read()

        # process data and use returned content (if any); compare against
        # None so that an intentionally empty body (b'') is kept instead of
        # falling back to the unprocessed bytes
        new_data = self.data_response(req, resp, data)
        if new_data is not None:
            data = new_data

        # reformat the stuff: the original stream was consumed by read(),
        # so the body is served again from an in-memory buffer
        fp = BytesIO(data)
        old_resp = resp
        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp

    https_response = http_response
class RespStrHandler(RespDataHandler):
    """Make it easier to use the _decoded_ response body.

    Subclasses override `str_response` to inspect or rewrite the body as
    text; decoding and re-encoding use the encoding reported by
    `detect_encoding`.
    """

    def str_response(self, req, resp, data_str):
        """Hook called with the decoded response body.

        Return the replacement text, or None to leave the body untouched.
        The default implementation leaves it untouched.
        """
        return None

    # The hook was first declared under this misspelled name while
    # `data_response` called the correctly spelled one; the alias keeps the
    # old attribute name reachable.
    str_reponse = str_response

    def data_response(self, req, resp, data):
        """Decode the body, run it through `str_response`, return bytes."""
        # decode
        enc = detect_encoding(data, resp)
        data_str = data.decode(enc, 'replace')

        # process
        data_str = self.str_response(req, resp, data_str)

        # return: the hook declined to change anything, keep the raw bytes
        if data_str is None:
            return data

        # Decoding with 'replace' may have inserted U+FFFD, which many
        # charsets cannot represent; a strict encode would then raise
        # UnicodeEncodeError, so encode with 'replace' as well.
        return data_str.encode(enc, 'replace')
class DebugHandler(BaseHandler): class DebugHandler(BaseHandler):
handler_order = 2000 handler_order = 2000
@ -196,7 +240,7 @@ class SizeLimitHandler(BaseHandler):
handler_order = 450 handler_order = 450
def __init__(self, limit=5*1024^2): def __init__(self, limit=5*1024**2):
self.limit = limit self.limit = limit
def http_response(self, req, resp): def http_response(self, req, resp):
@ -217,29 +261,17 @@ def UnGzip(data):
return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data) return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)
class GZIPHandler(BaseHandler): class GZIPHandler(RespDataHandler):
def http_request(self, req): def http_request(self, req):
req.add_unredirected_header('Accept-Encoding', 'gzip') req.add_unredirected_header('Accept-Encoding', 'gzip')
return req return req
def http_response(self, req, resp): def data_response(self, req, resp, data):
if 200 <= resp.code < 300: if 200 <= resp.code < 300:
if resp.headers.get('Content-Encoding') == 'gzip': if resp.headers.get('Content-Encoding') == 'gzip':
data = resp.read()
data = UnGzip(data)
resp.headers['Content-Encoding'] = 'identity' resp.headers['Content-Encoding'] = 'identity'
fp = BytesIO(data) return UnGzip(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
https_request = http_request
def detect_encoding(data, resp=None): def detect_encoding(data, resp=None):
@ -276,28 +308,9 @@ def detect_raw_encoding(data, resp=None):
return 'utf-8' return 'utf-8'
class EncodingFixHandler(BaseHandler): class EncodingFixHandler(RespStrHandler):
def __init__(self, encoding=None): def str_response(self, req, resp, data_str):
self.encoding = encoding return data_str
def http_response(self, req, resp):
maintype = resp.info().get('Content-Type', '').split('/')[0]
if 200 <= resp.code < 300 and maintype == 'text':
data = resp.read()
enc = self.encoding or detect_encoding(data, resp)
data = data.decode(enc, 'replace')
data = data.encode(enc)
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class UAHandler(BaseHandler): class UAHandler(BaseHandler):
@ -323,71 +336,51 @@ class BrowserlyHeaderHandler(BaseHandler):
https_request = http_request https_request = http_request
class AlternateHandler(BaseHandler): def iter_html_tag(html_str, tag_name):
re_tag = r'<%s(\s*[^>])*>' % tag_name
re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
for tag_match in re.finditer(re_tag, html_str):
attr_match = re.findall(re_attr, tag_match.group(0))
if attr_match is not None:
yield dict(attr_match)
class AlternateHandler(RespStrHandler):
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> " " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "
def __init__(self, follow=None): def __init__(self, follow=None):
self.follow = follow or [] self.follow = follow or []
def http_response(self, req, resp): def str_response(self, req, resp, data_str):
contenttype = resp.info().get('Content-Type', '').split(';')[0] contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow: if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
# opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
data = resp.read() for link in iter_html_tag(data_str[:10000], 'link'):
if (link.get('rel') == 'alternate'
try: and link.get('type') in self.follow
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]') and 'href' in link):
resp.code = 302
for link in links: resp.msg = 'Moved Temporarily'
if link.get('type', '') in self.follow: resp.headers['location'] = link.get('href')
resp.code = 302 break
resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href')
break
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class HTTPEquivHandler(BaseHandler): class HTTPEquivHandler(RespStrHandler):
" Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers " " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "
handler_order = 600 handler_order = 600
def http_response(self, req, resp): def str_response(self, req, resp, data_str):
contenttype = resp.info().get('Content-Type', '').split(';')[0] contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']: if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
data = resp.read()
try: for meta in iter_html_tag(data_str[:10000], 'meta'):
headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]') if 'http-equiv' in meta and 'content' in meta:
resp.headers[meta.get('http-equiv').lower()] = meta.get('content')
for header in headers:
resp.headers[header.get('http-equiv').lower()] = header.get('content')
except (ValueError, SyntaxError):
# catch parsing errors
pass
fp = BytesIO(data)
old_resp = resp
resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
return resp
https_response = http_response
class HTTPRefreshHandler(BaseHandler): class HTTPRefreshHandler(BaseHandler):

View File

@ -257,7 +257,7 @@ def cgi_error_handler(environ, start_response, app):
except Exception as e: except Exception as e:
headers = {'status': '500 Oops', 'content-type': 'text/html'} headers = {'status': '500 Oops', 'content-type': 'text/html'}
start_response(headers['status'], list(headers.items()), sys.exc_info()) start_response(headers['status'], list(headers.items()), sys.exc_info())
log('ERROR: %s' % repr(e), force=True) log('ERROR: %s' % repr(e))
return [cgitb.html(sys.exc_info())] return [cgitb.html(sys.exc_info())]