From e5f8e4365927d50f6a9512b4dfed8e1529d60159 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Wed, 8 Mar 2017 18:03:34 -1000
Subject: [PATCH] Shifted the redirect to crawler

Now using MIMETYPE var from crawler within morss.py
---
 morss/crawler.py | 47 ++++++++++++++++++++++++++++-------------------
 morss/morss.py   | 24 +++++-------------------
 2 files changed, 33 insertions(+), 38 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index b8d8dd9..263d110 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -113,7 +113,9 @@ class AutoRefererHandler(BaseHandler):
     https_request = http_request
 
 
-class ContentNegociationHandler(BaseHandler): #FIXME
+class ContentNegociationHandler(BaseHandler):
+    " Handler for content negotiation. Also parses <link rel='alternate'> redirects out of html pages "
+
     def __init__(self, accept=None, strict=False):
         self.accept = accept
         self.strict = strict
@@ -123,31 +125,38 @@ class ContentNegociationHandler(BaseHandler): #FIXME
         if isinstance(self.accept, basestring):
             self.accept = (self.accept,)
 
-        out = {}
-        rank = 1.1
-        for group in self.accept:
-            rank -= 0.1
+        string = ','.join(self.accept)
 
-            if isinstance(group, basestring):
-                if group in MIMETYPE:
-                    group = MIMETYPE[group]
-                else:
-                    out[group] = rank
-                    continue
+        if self.strict:
+            string += ',*/*;q=0.9'
 
-            for mime in group:
-                if mime not in out:
-                    out[mime] = rank
-
-        if not self.strict:
-            out['*/*'] = rank - 0.1
-
-        string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
         req.add_unredirected_header('Accept', string)
 
         return req
 
+    def http_response(self, req, resp):
+        contenttype = resp.info().get('Content-Type', '').split(';')[0]
+        if 200 <= resp.code < 300 and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
+            # oops, not what we were looking for, let's see if the html page suggests an alternate page of the right type
+
+            data = resp.read()
+            links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
+
+            for link in links:
+                if link.get('type', '') in self.accept:
+                    resp.code = 302
+                    resp.msg = 'Moved Temporarily'
+                    resp.headers['location'] = link.get('href')
+
+            fp = BytesIO(data)
+            old_resp = resp
+            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+
+        return resp
+
     https_request = http_request
+    https_response = http_response
 
 
 class HTTPEquivHandler(BaseHandler):
diff --git a/morss/morss.py b/morss/morss.py
index 1d917dd..af874f1 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -50,10 +50,6 @@ PORT = 8080
 
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 
-MIMETYPE = {
-    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
-    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
-
 PROTOCOL = ['http', 'https', 'ftp']
 
 
@@ -137,7 +133,7 @@ default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
 
 def custom_handler(accept, delay=DELAY):
     handlers = default_handlers[:]
-    handlers.append(crawler.ContentNegociationHandler(accept))
+    handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept]))
     handlers.append(crawler.SQliteCacheHandler(delay))
 
     return build_opener(*handlers)
@@ -270,7 +266,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        con = custom_handler(('html', 'text/*'), delay).open(link, timeout=TIMEOUT)
+        con = custom_handler('html', delay).open(link, timeout=TIMEOUT)
         data = con.read()
 
     except (IOError, HTTPException) as e:
@@ -278,7 +274,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         return False # let's just delete errors stuff when in cache mode
 
     contenttype = con.info().get('Content-Type', '').split(';')[0]
-    if contenttype not in MIMETYPE['html'] and contenttype != 'text/plain':
+    if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
         log('non-text page')
         return True
 
@@ -371,7 +367,7 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        con = custom_handler(('xml', 'html'), delay).open(url, timeout=TIMEOUT * 2)
+        con = custom_handler('xml', delay).open(url, timeout=TIMEOUT * 2)
         xml = con.read()
 
     except (HTTPError) as e:
@@ -387,7 +383,7 @@ def FeedFetch(url, options):
         log('itunes redirect: %s' % link)
         return FeedFetch(link, options)
 
-    elif re.match(b'\s*
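
Note for reviewers: below is a minimal standalone sketch (illustrative only, not part of the patch) of the two behaviours ContentNegociationHandler now combines: http_request builds the Accept header from the wanted mimetypes, and http_response falls back to a <link rel="alternate"> found in an unwanted html reply. The names build_accept and find_alternate are made up for the sketch, the MIMETYPE dict is the one this patch removes from morss.py, and lxml is assumed to be installed.

import lxml.html

# Copied from the MIMETYPE dict this patch moves out of morss.py.
MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml',
            'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}


def build_accept(accept, strict=False):
    # Mirrors http_request: join the wanted mimetypes into one Accept
    # header; in strict mode append a low-priority wildcard, so the server
    # may still answer with html, which the alternate-link scan then catches.
    string = ','.join(accept)

    if strict:
        string += ',*/*;q=0.9'

    return string


def find_alternate(data, accept):
    # Mirrors http_response: scan the first 10kB of an html page for a
    # <link rel="alternate"> of a wanted mimetype and return its href.
    # (The handler's loop has no break, so its last match wins; the first
    # match is used here for simplicity.)
    links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')

    for link in links:
        if link.get('type', '') in accept:
            return link.get('href')

    return None


if __name__ == '__main__':
    print(build_accept(MIMETYPE['xml'], strict=True))

    page = b'<html><head><link rel="alternate" type="application/rss+xml" href="/feed.xml"/></head></html>'
    print(find_alternate(page, MIMETYPE['xml']))  # prints /feed.xml

In the handler itself the href is not returned but turned into a synthetic 302, so the HTTPRedirectHandler that build_opener() installs by default performs the actual second request.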