Move custom_handler to crawler
Makes more sense. Easier to reuse. Also cleaned up the code a bit.
parent beec6469cc
commit 2003e2760b
--- a/crawler.py
+++ b/crawler.py
@@ -29,6 +29,28 @@ MIMETYPE = {
 	'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
 	'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
 
+
+DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
+
+
+def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
+	handlers = []
+
+	handlers.append(GZIPHandler())
+	handlers.append(HTTPEquivHandler())
+	handlers.append(HTTPRefreshHandler())
+
+	if not basic:
+		handlers.append(UAHandler(DEFAULT_UA))
+		handlers.append(AutoRefererHandler())
+
+	handlers.append(EncodingFixHandler(encoding))
+	handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
+	handlers.append(SQliteCacheHandler(delay))
+
+	return build_opener(*handlers)
+
+
 class GZIPHandler(BaseHandler):
 	def http_request(self, req):
 		req.add_unredirected_header('Accept-Encoding', 'gzip')
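For context on the chain assembled above: each of these classes is a urllib2-style BaseHandler, and build_opener() stacks them into one opener whose http_request hooks run on every outgoing request, exactly as GZIPHandler does with its Accept-Encoding header. A minimal standalone sketch of that protocol, assuming Python 2's urllib2 as this codebase uses; the handler name and X-Example header are made up for illustration:

import urllib2

class ExampleHeaderHandler(urllib2.BaseHandler):
	# urllib2 calls <protocol>_request on every outgoing request;
	# the handler may edit the request in place and must return it
	def http_request(self, req):
		req.add_unredirected_header('X-Example', 'demo')
		return req

opener = urllib2.build_opener(ExampleHeaderHandler())
data = opener.open('http://example.com/').read()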
--- a/morss.py
+++ b/morss.py
@@ -48,8 +48,6 @@ THREADS = 10 # number of threads (1 for single-threaded)
 DEBUG = False
 PORT = 8080
 
-DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
-
 PROTOCOL = ['http', 'https', 'ftp']
 
 
@@ -127,19 +125,6 @@ def parseOptions(options):
 	return out
 
-
-default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
-	crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(),
-	crawler.HTTPRefreshHandler()]
-
-def custom_handler(accept, strict=False, delay=DELAY, encoding=None):
-	handlers = default_handlers[:]
-	handlers.append(crawler.EncodingFixHandler(encoding))
-	handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept], strict))
-	handlers.append(crawler.SQliteCacheHandler(delay))
-
-	return build_opener(*handlers)
-
 
 def ItemFix(item, feedurl='/'):
 	""" Improves feed items (absolute links, resolve feedburner links, etc) """
 
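One detail of the removed block worth noting: it kept a shared module-level list and copied it on each call (default_handlers[:]) so the appends would not leak back into the shared default; the new crawler.custom_handler sidesteps this by building a fresh list per call. A toy sketch of the aliasing pitfall the old copy avoided (names hypothetical):

defaults = ['gzip', 'ua']

def build_aliased(extra):
	handlers = defaults        # alias: the append mutates `defaults` too
	handlers.append(extra)
	return handlers

def build_copied(extra):
	handlers = defaults[:]     # shallow copy, as the removed code did
	handlers.append(extra)
	return handlers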
@@ -267,7 +252,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
 		delay = -2
 
 	try:
-		con = custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
+		con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
 		data = con.read()
 
 	except (IOError, HTTPException) as e:
@@ -368,7 +353,7 @@ def FeedFetch(url, options):
 		delay = 0
 
 	try:
-		con = custom_handler('xml', True, delay, options.encoding).open(url, timeout=TIMEOUT * 2)
+		con = crawler.custom_handler('xml', True, delay, options.encoding).open(url, timeout=TIMEOUT * 2)
 		xml = con.read()
 
 	except (HTTPError) as e:
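With the opener construction now in one place, other scripts can reuse it without copying morss.py's old handler list, which is the reuse the commit message points at. A hedged usage sketch, assuming crawler.py is importable as `crawler` just as morss.py does above; the URL and delay value are illustrative, and SQliteCacheHandler's exact delay semantics are not shown in this diff:

import crawler

# full chain: gzip, <meta http-equiv>/HTTP refresh handling, UA spoofing,
# auto-referer, encoding fix, HTML content negotiation, SQLite cache
con = crawler.custom_handler(accept='html', delay=0).open('http://example.com/')
data = con.read()

# basic=True drops the UAHandler/AutoRefererHandler pair ('if not basic' above)
opener = crawler.custom_handler(accept='xml', strict=True, basic=True)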