Shifted the <link rel='alternate'/> redirect to crawler
Now using MIMETYPE var from crawler within morss.py

parent fb8825b410
commit e5f8e43659
@@ -113,7 +113,9 @@ class AutoRefererHandler(BaseHandler):
 	https_request = http_request


-class ContentNegociationHandler(BaseHandler): #FIXME
+class ContentNegociationHandler(BaseHandler):
+	" Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
+
 	def __init__(self, accept=None, strict=False):
 		self.accept = accept
 		self.strict = strict
@@ -123,31 +125,38 @@ class ContentNegociationHandler(BaseHandler): #FIXME
 			if isinstance(self.accept, basestring):
 				self.accept = (self.accept,)

-			out = {}
-			rank = 1.1
-			for group in self.accept:
-				rank -= 0.1
+			string = ','.join(self.accept)

-				if isinstance(group, basestring):
-					if group in MIMETYPE:
-						group = MIMETYPE[group]
-					else:
-						out[group] = rank
-						continue
+			if self.strict:
+				string += ',*/*;q=0.9'

-				for mime in group:
-					if mime not in out:
-						out[mime] = rank
-
-			if not self.strict:
-				out['*/*'] = rank - 0.1
-
-			string = ','.join([x + ';q={0:.1}'.format(out[x]) if out[x] != 1 else x for x in out])
 			req.add_unredirected_header('Accept', string)

 		return req

+	def http_response(self, req, resp):
+		contenttype = resp.info().get('Content-Type', '').split(';')[0]
+		if 200 <= resp.code < 300 and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
+			# opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
+
+			data = resp.read()
+			links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
+
+			for link in links:
+				if link.get('type', '') in self.accept:
+					resp.code = 302
+					resp.msg = 'Moved Temporarily'
+					resp.headers['location'] = link.get('href')
+
+			fp = BytesIO(data)
+			old_resp = resp
+			resp = addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+			resp.msg = old_resp.msg
+
+		return resp
+
 	https_request = http_request
+	https_response = http_response


 class HTTPEquivHandler(BaseHandler):
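Note on the new http_response above: on a strict handler, a 2xx HTML answer that isn't in self.accept gets scanned for <link rel='alternate'/> and rewritten into a 302, which urllib's stock redirect handler then follows; the BytesIO/addinfourl dance just rebuilds a response whose body can still be read. A standalone sketch of that scan, assuming only lxml (the HTML snippet is invented, not from the diff):

    import lxml.html

    accept = ['application/rss+xml', 'application/atom+xml']
    data = b'<html><head><link rel="alternate" type="application/rss+xml" href="/feed.xml"/></head><body></body></html>'

    # same bound as above: only the first 10kB are parsed
    links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
    for link in links:
        if link.get('type', '') in accept:
            print('would 302 to %s' % link.get('href'))  # -> /feed.xml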
@@ -50,10 +50,6 @@ PORT = 8080

 DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'

-MIMETYPE = {
-	'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
-	'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
-
 PROTOCOL = ['http', 'https', 'ftp']


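With the table now living only in crawler.py, the Accept header comes from a flat join instead of the old computed q-values. A worked example using the 'xml' row above (pure string operations, runnable as-is):

    accept = ['text/xml', 'application/xml', 'application/rss+xml',
              'application/rdf+xml', 'application/atom+xml']  # crawler.MIMETYPE['xml']
    strict = True

    string = ','.join(accept)
    if strict:
        string += ',*/*;q=0.9'   # strict mode still tolerates anything, at low priority

    print(string)
    # text/xml,application/xml,application/rss+xml,application/rdf+xml,application/atom+xml,*/*;q=0.9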
@@ -137,7 +133,7 @@ default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),

 def custom_handler(accept, delay=DELAY):
 	handlers = default_handlers[:]
-	handlers.append(crawler.ContentNegociationHandler(accept))
+	handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept]))
 	handlers.append(crawler.SQliteCacheHandler(delay))

 	return build_opener(*handlers)
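Since the handler no longer resolves MIMETYPE keys itself, custom_handler now takes the key ('xml' or 'html') and does the lookup at the call site. A hedged usage sketch (import path assumed from the repo layout; URL and timeout are placeholders):

    from morss.morss import custom_handler

    con = custom_handler('xml', delay=0).open('http://example.com/feed', timeout=10)
    xml = con.read()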
@@ -270,7 +266,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
 	delay = -2

 	try:
-		con = custom_handler(('html', 'text/*'), delay).open(link, timeout=TIMEOUT)
+		con = custom_handler('html', delay).open(link, timeout=TIMEOUT)
 		data = con.read()

 	except (IOError, HTTPException) as e:
@@ -278,7 +274,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
 		return False # let's just delete errors stuff when in cache mode

 	contenttype = con.info().get('Content-Type', '').split(';')[0]
-	if contenttype not in MIMETYPE['html'] and contenttype != 'text/plain':
+	if contenttype not in crawler.MIMETYPE['html'] and contenttype != 'text/plain':
 		log('non-text page')
 		return True

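Same mechanical rename at this call site. For reference, the comparison runs on the Content-Type with its parameters stripped (worked example, header value invented):

    header = 'text/html; charset=UTF-8'  # invented raw header value
    contenttype = header.split(';')[0]   # -> 'text/html'
    print(contenttype in ['text/html', 'application/xhtml+xml', 'application/xml'])  # True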
@@ -371,7 +367,7 @@ def FeedFetch(url, options):
 	delay = 0

 	try:
-		con = custom_handler(('xml', 'html'), delay).open(url, timeout=TIMEOUT * 2)
+		con = custom_handler('xml', delay).open(url, timeout=TIMEOUT * 2)
 		xml = con.read()

 	except (HTTPError) as e:
@@ -387,7 +383,7 @@ def FeedFetch(url, options):
 		log('itunes redirect: %s' % link)
 		return FeedFetch(link, options)

-	elif re.match(b'\s*<?xml', xml) is not None or contenttype in MIMETYPE['xml']:
+	elif re.match(b'\s*<?xml', xml) is not None or contenttype in crawler.MIMETYPE['xml']:
 		rss = feeds.parse(xml)

 	elif feedify.supported(url):
@@ -395,16 +391,6 @@ def FeedFetch(url, options):
 		feed.build()
 		rss = feed.feed

-	elif contenttype in MIMETYPE['html']:
-		match = lxml.html.fromstring(xml).xpath(
-			"//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
-		if len(match):
-			link = urljoin(url, match[0])
-			log('rss redirect: %s' % link)
-			return FeedFetch(link, options)
-		else:
-			log('no-link html')
-			raise MorssException('Link provided is an HTML page, which doesn\'t link to a feed')
 	else:
 		log('random page')
 		log(contenttype)
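The block removed above is exactly what moved into crawler.ContentNegociationHandler.http_response, so FeedFetch no longer special-cases HTML pages: the redirect happens inside the opener. Note the handler only rewrites when strict is set, per its `and self.strict` condition. A minimal wiring sketch under that assumption (Python 2 import, matching the basestring usage in crawler.py; import path assumed from the repo layout; URL invented):

    from urllib2 import build_opener  # urllib.request.build_opener on Python 3
    from morss import crawler

    opener = build_opener(crawler.ContentNegociationHandler(crawler.MIMETYPE['xml'], strict=True))

    # An HTML page advertising <link rel="alternate" type="application/rss+xml" ...>
    # comes back as a 302 from the handler, and urllib follows it to the feed.
    con = opener.open('http://blog.example.com/')  # invented URL
    print(con.info().get('Content-Type'))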