Compare commits

6 commits: fb19b1241f ... 01a7667032

| Author | SHA1 | Date |
|---|---|---|
| pictuga | 01a7667032 | |
| pictuga | 3e886caaab | |
| pictuga | ad927e03a7 | |
| pictuga | 0efb096fa7 | |
| pictuga | 9ab2e488ef | |
| pictuga | b525ab0d26 | |
morss/crawler.py (169 lines changed)
```diff
@@ -23,7 +23,6 @@ from io import BytesIO, StringIO
 import re
 import chardet
 from cgi import parse_header
-import lxml.html
 import time
 import threading
 import random
@@ -105,7 +104,7 @@ def adv_get(url, timeout=None, *args, **kwargs):
     }


-def custom_handler(follow=None, delay=None, encoding=None):
+def custom_handler(follow=None, delay=None):
     handlers = []

     # as per urllib2 source code, these Handelers are added first
@@ -124,7 +123,7 @@ def custom_handler(follow=None, delay=None, encoding=None):
     handlers.append(HTTPRefreshHandler())
     handlers.append(UAHandler(random.choice(DEFAULT_UAS)))
     handlers.append(BrowserlyHeaderHandler())
-    handlers.append(EncodingFixHandler(encoding))
+    handlers.append(EncodingFixHandler())

     if follow:
         handlers.append(AlternateHandler(MIMETYPE[follow]))
```
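For context, the objects appended by custom_handler() are ordinary urllib.request handler instances. A minimal sketch of how such a chain is typically installed and used (the wiring inside custom_handler() itself is not shown in this diff, so the build_opener() call below is an assumption for illustration):

```python
import urllib.request

# Hypothetical wiring: a list of handler instances like the ones appended above,
# chained into a single opener by the standard library.
handlers = [urllib.request.HTTPCookieProcessor()]  # stdlib handler, used only as a stand-in
opener = urllib.request.build_opener(*handlers)

with opener.open('https://example.com', timeout=10) as resp:
    print(resp.status, resp.headers.get('Content-Type'))
```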
```diff
@@ -176,6 +175,51 @@ def sanitize_url(url):
     return urlunparse(parts)


+class RespDataHandler(BaseHandler):
+    " Make it easier to use the reponse body "
+
+    def data_reponse(self, req, resp, data):
+        pass
+
+    def http_response(self, req, resp):
+        # read data
+        data = resp.read()
+
+        # process data and use returned content (if any)
+        data = self.data_response(req, resp, data) or data
+
+        # reformat the stuff
+        fp = BytesIO(data)
+        old_resp = resp
+        resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+        resp.msg = old_resp.msg
+
+        return resp
+
+    https_response = http_response
+
+
+class RespStrHandler(RespDataHandler):
+    " Make it easier to use the _decoded_ reponse body "
+
+    def str_reponse(self, req, resp, data_str):
+        pass
+
+    def data_response(self, req, resp, data):
+        #decode
+        enc = detect_encoding(data, resp)
+        data_str = data.decode(enc, 'replace')
+
+        #process
+        data_str = self.str_response(req, resp, data_str)
+
+        # return
+        data = data_str.encode(enc) if data_str is not None else data
+
+        #return
+        return data
+
+
 class DebugHandler(BaseHandler):
     handler_order = 2000

```
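The point of the two new base classes is that a concrete handler only has to transform the body: reading the stream, rebuilding the addinfourl response, and (for RespStrHandler) the decode/encode round trip are done once in the base classes. A minimal sketch of a custom handler built on top of them (the import path and the TitleLogger name are assumptions for illustration, not part of this diff):

```python
from urllib.request import build_opener

from morss.crawler import RespStrHandler  # assumed import path for the class added above


class TitleLogger(RespStrHandler):
    " Hypothetical handler: print the <title> of HTML responses, leave the body untouched "

    def str_response(self, req, resp, data_str):
        start = data_str.find('<title>')
        if start != -1:
            end = data_str.find('</title>', start)
            print('title:', data_str[start + len('<title>'):end].strip())

        # Returning None keeps the original bytes: data_response() falls back to `data`,
        # and http_response() rewraps them in a fresh addinfourl.
        return None


opener = build_opener(TitleLogger())
# opener.open('https://example.com')  # response passes through unchanged, title is printed
```

EncodingFixHandler, GZIPHandler, AlternateHandler and HTTPEquivHandler in the hunks below are rewritten to follow exactly this pattern.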
```diff
@@ -196,7 +240,7 @@ class SizeLimitHandler(BaseHandler):

     handler_order = 450

-    def __init__(self, limit=5*1024^2):
+    def __init__(self, limit=5*1024**2):
         self.limit = limit

     def http_response(self, req, resp):
```
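The one-character change in SizeLimitHandler is a genuine bug fix: in Python, `^` is bitwise XOR and binds more loosely than `*`, so the old default limit was about 5 KiB rather than the intended 5 MiB:

```python
>>> 5*1024^2        # parsed as (5*1024) ^ 2, i.e. bitwise XOR
5122
>>> 5*1024**2       # exponentiation: 5 MiB
5242880
```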
```diff
@@ -217,29 +261,17 @@ def UnGzip(data):
     return zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(data)


-class GZIPHandler(BaseHandler):
+class GZIPHandler(RespDataHandler):
     def http_request(self, req):
         req.add_unredirected_header('Accept-Encoding', 'gzip')
         return req

-    def http_response(self, req, resp):
+    def data_response(self, req, resp, data):
         if 200 <= resp.code < 300:
             if resp.headers.get('Content-Encoding') == 'gzip':
-                data = resp.read()
-
-                data = UnGzip(data)
-
                 resp.headers['Content-Encoding'] = 'identity'

-                fp = BytesIO(data)
-                old_resp = resp
-                resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-                resp.msg = old_resp.msg
-
-                return resp
-
-    https_response = http_response
-    https_request = http_request
+                return UnGzip(data)


 def detect_encoding(data, resp=None):
```
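GZIPHandler can now simply return the decompressed bytes and let RespDataHandler rebuild the response. A quick standalone check of the UnGzip() helper kept by this hunk (wbits = zlib.MAX_WBITS | 32 lets zlib auto-detect a gzip or zlib header):

```python
import gzip
import zlib

payload = b'<rss version="2.0"></rss>'
compressed = gzip.compress(payload)

# Same call as UnGzip() above
restored = zlib.decompressobj(zlib.MAX_WBITS | 32).decompress(compressed)
assert restored == payload
```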
```diff
@@ -276,28 +308,9 @@ def detect_raw_encoding(data, resp=None):
     return 'utf-8'


-class EncodingFixHandler(BaseHandler):
-    def __init__(self, encoding=None):
-        self.encoding = encoding
-
-    def http_response(self, req, resp):
-        maintype = resp.info().get('Content-Type', '').split('/')[0]
-        if 200 <= resp.code < 300 and maintype == 'text':
-            data = resp.read()
-
-            enc = self.encoding or detect_encoding(data, resp)
-
-            data = data.decode(enc, 'replace')
-            data = data.encode(enc)
-
-            fp = BytesIO(data)
-            old_resp = resp
-            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
+class EncodingFixHandler(RespStrHandler):
+    def str_response(self, req, resp, data_str):
+        return data_str


 class UAHandler(BaseHandler):
```
```diff
@@ -323,71 +336,51 @@ class BrowserlyHeaderHandler(BaseHandler):
     https_request = http_request


-class AlternateHandler(BaseHandler):
+def iter_html_tag(html_str, tag_name):
+    re_tag = r'<%s(\s*[^>])*>' % tag_name
+    re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'
+
+    for tag_match in re.finditer(re_tag, html_str):
+        attr_match = re.findall(re_attr, tag_match.group(0))
+
+        if attr_match is not None:
+            yield dict(attr_match)
+
+
+class AlternateHandler(RespStrHandler):
     " Follow <link rel='alternate' type='application/rss+xml' href='...' /> "

     def __init__(self, follow=None):
         self.follow = follow or []

-    def http_response(self, req, resp):
+    def str_response(self, req, resp, data_str):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
+
         if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow:
             # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types

-            data = resp.read()
-
-            try:
-                links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
-
-                for link in links:
-                    if link.get('type', '') in self.follow:
-                        resp.code = 302
-                        resp.msg = 'Moved Temporarily'
-                        resp.headers['location'] = link.get('href')
-                        break
-
-            except (ValueError, SyntaxError):
-                # catch parsing errors
-                pass
-
-            fp = BytesIO(data)
-            old_resp = resp
-            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
+            for link in iter_html_tag(data_str[:10000], 'link'):
+                if (link.get('rel') == 'alternate'
+                        and link.get('type') in self.follow
+                        and 'href' in link):
+                    resp.code = 302
+                    resp.msg = 'Moved Temporarily'
+                    resp.headers['location'] = link.get('href')
+                    break


-class HTTPEquivHandler(BaseHandler):
+class HTTPEquivHandler(RespStrHandler):
     " Handler to support <meta http-equiv='...' content='...' />, since it defines HTTP headers "

     handler_order = 600

-    def http_response(self, req, resp):
+    def str_response(self, req, resp, data_str):
         contenttype = resp.info().get('Content-Type', '').split(';')[0]
         if 200 <= resp.code < 300 and contenttype in MIMETYPE['html']:
-            data = resp.read()

-            try:
-                headers = lxml.html.fromstring(data[:10000]).findall('.//meta[@http-equiv]')
-
-                for header in headers:
-                    resp.headers[header.get('http-equiv').lower()] = header.get('content')
-
-            except (ValueError, SyntaxError):
-                # catch parsing errors
-                pass
-
-            fp = BytesIO(data)
-            old_resp = resp
-            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-
-        return resp
-
-    https_response = http_response
+            for meta in iter_html_tag(data_str[:10000], 'meta'):
+                if 'http-equiv' in meta and 'content' in meta:
+                    resp.headers[meta.get('http-equiv').lower()] = meta.get('content')


 class HTTPRefreshHandler(BaseHandler):
```
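The regex-based iter_html_tag() replaces the lxml.html parsing in both AlternateHandler and HTTPEquivHandler. A small standalone check of what it yields on a snippet (the function body is copied from the hunk above so the example runs on its own):

```python
import re


def iter_html_tag(html_str, tag_name):
    # copied verbatim from the new crawler.py above
    re_tag = r'<%s(\s*[^>])*>' % tag_name
    re_attr = r'(?P<key>[^=\s]+)=[\'"](?P<value>[^\'"]+)[\'"]'

    for tag_match in re.finditer(re_tag, html_str):
        attr_match = re.findall(re_attr, tag_match.group(0))

        if attr_match is not None:
            yield dict(attr_match)


html = '<head><link rel="alternate" type="application/rss+xml" href="/feed.xml"></head>'

print(list(iter_html_tag(html, 'link')))
# [{'rel': 'alternate', 'type': 'application/rss+xml', 'href': '/feed.xml'}]
```

Note that re.findall() returns an empty list rather than None, so the `is not None` guard always passes; a tag without attributes simply yields an empty dict.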
```diff
@@ -257,7 +257,7 @@ def cgi_error_handler(environ, start_response, app):
     except Exception as e:
         headers = {'status': '500 Oops', 'content-type': 'text/html'}
         start_response(headers['status'], list(headers.items()), sys.exc_info())
-        log('ERROR: %s' % repr(e), force=True)
+        log('ERROR: %s' % repr(e))
         return [cgitb.html(sys.exc_info())]

```