From e5f8e4365927d50f6a9512b4dfed8e1529d60159 Mon Sep 17 00:00:00 2001
From: pictuga
Date: Wed, 8 Mar 2017 18:03:34 -1000
Subject: [PATCH] Shifted the redirect to crawler

Now using MIMETYPE var from crawler within morss.py
---
 morss/crawler.py | 47 ++++++++++++++++++++++++++++-------------------
 morss/morss.py   | 24 +++++-------------------
 2 files changed, 33 insertions(+), 38 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index b8d8dd9..263d110 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -113,7 +113,9 @@ class AutoRefererHandler(BaseHandler):
     https_request = http_request
 
 
-class ContentNegociationHandler(BaseHandler): #FIXME
+class ContentNegociationHandler(BaseHandler):
+    " Handler for content negotiation. Also parses <link rel='alternate'> redirects out of html pages "
+
     def __init__(self, accept=None, strict=False):
         self.accept = accept
         self.strict = strict
@@ -123,31 +125,38 @@ class ContentNegociationHandler(BaseHandler): #FIXME
         if isinstance(self.accept, basestring):
             self.accept = (self.accept,)
 
-        out = {}
-        rank = 1.1
-        for group in self.accept:
-            rank -= 0.1
+        string = ','.join(self.accept)
 
-            if isinstance(group, basestring):
-                if group in MIMETYPE:
-                    group = MIMETYPE[group]
-                else:
-                    out[group] = rank
-                    continue
+        if self.strict:
+            string += ',*/*;q=0.9'
 
-            for mime in group:
-                if mime not in out:
-                    out[mime] = rank
-
-        if not self.strict:
-            out['*/*'] = rank - 0.1
-
-        string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
         req.add_unredirected_header('Accept', string)
 
         return req
 
+    def http_response(self, req, resp):
+        contenttype = resp.info().get('Content-Type', '').split(';')[0]
+        if 200 <= resp.code < 300 and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
+            # oops, not what we were looking for, let's see if the html page suggests an alternate page of the right type
+
+            data = resp.read()
+            links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
+
+            for link in links:
+                if link.get('type', '') in self.accept:
+                    resp.code = 302
+                    resp.msg = 'Moved Temporarily'
+                    resp.headers['location'] = link.get('href')
+
+            fp = BytesIO(data)
+            old_resp = resp
+            resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+
+        return resp
+
     https_request = http_request
+    https_response = http_response
 
 
 class HTTPEquivHandler(BaseHandler):
diff --git a/morss/morss.py b/morss/morss.py
index 1d917dd..af874f1 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -50,10 +50,6 @@ PORT = 8080
 
 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
 
-MIMETYPE = {
-    'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
-    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
-
 PROTOCOL = ['http', 'https', 'ftp']
 
 
@@ -137,7 +133,7 @@ default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
 
 def custom_handler(accept, delay=DELAY):
     handlers = default_handlers[:]
-    handlers.append(crawler.ContentNegociationHandler(accept))
+    handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept]))
     handlers.append(crawler.SQliteCacheHandler(delay))
 
     return build_opener(*handlers)
@@ -270,7 +266,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        con = custom_handler(('html', 'text/*'), delay).open(link, timeout=TIMEOUT)
+        con = custom_handler('html', delay).open(link, timeout=TIMEOUT)
         data = con.read()
 
     except (IOError, HTTPException) as e:
@@ -278,7 +274,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         return False # let's just delete errors stuff when in cache mode
 
     contenttype = con.info().get('Content-Type', '').split(';')[0]
-    if contenttype not in MIMETYPE['html'] and contenttype != 'text/plain':
+    if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
         log('non-text page')
         return True
 
@@ -371,7 +367,7 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        con = custom_handler(('xml', 'html'), delay).open(url, timeout=TIMEOUT * 2)
+        con = custom_handler('xml', delay).open(url, timeout=TIMEOUT * 2)
         xml = con.read()
 
     except (HTTPError) as e:
@@ -387,7 +383,7 @@ def FeedFetch(url, options):
         log('itunes redirect: %s' % link)
         return FeedFetch(link, options)
 
-    elif re.match(b'\s*
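
Note for reviewers: below is a minimal standalone sketch (illustrative only, not part of the patch) of the two behaviours ContentNegociationHandler now combines: http_request builds the Accept header from the wanted mimetypes, and http_response falls back to a <link rel="alternate"> found in an unwanted html reply. The names build_accept and find_alternate are made up for the sketch, the MIMETYPE dict is the one this patch removes from morss.py, and lxml is assumed to be installed.

import lxml.html

# Copied from the MIMETYPE dict this patch moves out of morss.py.
MIMETYPE = {
    'xml': ['text/xml', 'application/xml', 'application/rss+xml',
            'application/rdf+xml', 'application/atom+xml'],
    'html': ['text/html', 'application/xhtml+xml', 'application/xml']}


def build_accept(accept, strict=False):
    # Mirrors http_request: join the wanted mimetypes into one Accept
    # header; in strict mode append a low-priority wildcard, so the server
    # may still answer with html, which the alternate-link scan then catches.
    string = ','.join(accept)

    if strict:
        string += ',*/*;q=0.9'

    return string


def find_alternate(data, accept):
    # Mirrors http_response: scan the first 10kB of an html page for a
    # <link rel="alternate"> of a wanted mimetype and return its href.
    # (The handler's loop has no break, so its last match wins; the first
    # match is used here for simplicity.)
    links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')

    for link in links:
        if link.get('type', '') in accept:
            return link.get('href')

    return None


if __name__ == '__main__':
    print(build_accept(MIMETYPE['xml'], strict=True))

    page = b'<html><head><link rel="alternate" type="application/rss+xml" href="/feed.xml"/></head></html>'
    print(find_alternate(page, MIMETYPE['xml']))  # prints /feed.xml

In the handler itself the href is not returned but turned into a synthetic 302, so the HTTPRedirectHandler that build_opener() installs by default performs the actual second request.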