Move custom_handler to crawler
Makes more sense there and is easier to reuse. Also cleaned up the code a bit.
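Callers now build their opener through the crawler module instead of a module-local helper. A minimal usage sketch of the relocated function (the feed URL, the 5-minute cache delay, and the 4-second timeout are illustrative values, not part of this commit):

    import crawler

    # Build an opener that negotiates XML strictly and lets the SQLite
    # cache serve entries younger than `delay` seconds before re-fetching.
    opener = crawler.custom_handler(accept='xml', strict=True, delay=5 * 60)
    con = opener.open('http://example.com/feed.xml', timeout=4)  # placeholder URL
    xml = con.read()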
@@ -29,6 +29,28 @@ MIMETYPE = {
     'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
     'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
 
+
+DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
+
+
+def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
+    handlers = []
+
+    handlers.append(GZIPHandler())
+    handlers.append(HTTPEquivHandler())
+    handlers.append(HTTPRefreshHandler())
+
+    if not basic:
+        handlers.append(UAHandler(DEFAULT_UA))
+        handlers.append(AutoRefererHandler())
+
+    handlers.append(EncodingFixHandler(encoding))
+    handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
+    handlers.append(SQliteCacheHandler(delay))
+
+    return build_opener(*handlers)
+
+
 class GZIPHandler(BaseHandler):
     def http_request(self, req):
         req.add_unredirected_header('Accept-Encoding', 'gzip')

@@ -48,8 +48,6 @@ THREADS = 10  # number of threads (1 for single-threaded)
 DEBUG = False
 PORT = 8080
 
-DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
-
 PROTOCOL = ['http', 'https', 'ftp']
 
 

@@ -127,19 +125,6 @@ def parseOptions(options):
     return out
 
 
-default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
-                    crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(),
-                    crawler.HTTPRefreshHandler()]
-
-def custom_handler(accept, strict=False, delay=DELAY, encoding=None):
-    handlers = default_handlers[:]
-    handlers.append(crawler.EncodingFixHandler(encoding))
-    handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept], strict))
-    handlers.append(crawler.SQliteCacheHandler(delay))
-
-    return build_opener(*handlers)
-
-
 def ItemFix(item, feedurl='/'):
     """ Improves feed items (absolute links, resolve feedburner links, etc) """
 

@@ -267,7 +252,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        con = custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
+        con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
         data = con.read()
 
     except (IOError, HTTPException) as e:

@@ -368,7 +353,7 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        con = custom_handler('xml', True, delay, options.encoding).open(url, timeout=TIMEOUT * 2)
+        con = crawler.custom_handler('xml', True, delay, options.encoding).open(url, timeout=TIMEOUT * 2)
         xml = con.read()
 
     except (HTTPError) as e:
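Note that this is not a verbatim move: the relocated function gains a basic flag the old morss-level helper did not have, and passing basic=True skips the UAHandler and AutoRefererHandler. A minimal sketch of that variant (the accept value is illustrative):

    # Bare opener: gzip decoding plus the HTTP-equiv and HTTP-refresh
    # handlers only; basic=True skips the spoofed User-Agent and the
    # automatic Referer.
    opener = crawler.custom_handler(accept='html', basic=True)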