diff --git a/morss/crawler.py b/morss/crawler.py index 4a35012..d2dfc48 100644 --- a/morss/crawler.py +++ b/morss/crawler.py @@ -29,6 +29,28 @@ MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'], 'html': ['text/html', 'application/xhtml+xml', 'application/xml']} + +DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0' + + +def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False): + handlers = [] + + handlers.append(GZIPHandler()) + handlers.append(HTTPEquivHandler()) + handlers.append(HTTPRefreshHandler()) + + if not basic: + handlers.append(UAHandler(DEFAULT_UA)) + handlers.append(AutoRefererHandler()) + + handlers.append(EncodingFixHandler(encoding)) + handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict)) + handlers.append(SQliteCacheHandler(delay)) + + return build_opener(*handlers) + + class GZIPHandler(BaseHandler): def http_request(self, req): req.add_unredirected_header('Accept-Encoding', 'gzip') diff --git a/morss/morss.py b/morss/morss.py index f3ea9fd..33d16b4 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -48,8 +48,6 @@ THREADS = 10 # number of threads (1 for single-threaded) DEBUG = False PORT = 8080 -DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0' - PROTOCOL = ['http', 'https', 'ftp'] @@ -127,19 +125,6 @@ def parseOptions(options): return out -default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA), - crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(), - crawler.HTTPRefreshHandler()] - -def custom_handler(accept, strict=False, delay=DELAY, encoding=None): - handlers = default_handlers[:] - handlers.append(crawler.EncodingFixHandler(encoding)) - handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept], strict)) - handlers.append(crawler.SQliteCacheHandler(delay)) - - return build_opener(*handlers) - - def ItemFix(item, feedurl='/'): """ Improves feed items (absolute links, resolve feedburner links, etc) """ @@ -267,7 +252,7 @@ def ItemFill(item, options, feedurl='/', fast=False): delay = -2 try: - con = custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT) + con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT) data = con.read() except (IOError, HTTPException) as e: @@ -368,7 +353,7 @@ def FeedFetch(url, options): delay = 0 try: - con = custom_handler('xml', True, delay, options.encoding).open(url, timeout=TIMEOUT * 2) + con = crawler.custom_handler('xml', True, delay, options.encoding).open(url, timeout=TIMEOUT * 2) xml = con.read() except (HTTPError) as e: