Compare commits

..

No commits in common. "bf86c1e9625cfb1273b67371bbc0f69265f93b19" and "e6811138fda1712aae3297a7cfcd7d1944e0abec" have entirely different histories.

2 changed files with 32 additions and 14 deletions

View File

@ -27,14 +27,13 @@ except NameError:
MIMETYPE = { MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'], 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml', 'application/xhtml+xml'],
'rss': ['application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
'html': ['text/html', 'application/xhtml+xml', 'application/xml']} 'html': ['text/html', 'application/xhtml+xml', 'application/xml']}
DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0' DEFAULT_UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0'
def custom_handler(follow=None, delay=None, encoding=None): def custom_handler(accept=None, strict=False, delay=None, encoding=None, basic=False):
handlers = [] handlers = []
# as per urllib2 source code, these Handelers are added first # as per urllib2 source code, these Handelers are added first
@ -52,12 +51,14 @@ def custom_handler(follow=None, delay=None, encoding=None):
handlers.append(HTTPEquivHandler()) handlers.append(HTTPEquivHandler())
handlers.append(HTTPRefreshHandler()) handlers.append(HTTPRefreshHandler())
handlers.append(UAHandler(DEFAULT_UA)) handlers.append(UAHandler(DEFAULT_UA))
handlers.append(AutoRefererHandler())
if not basic:
handlers.append(AutoRefererHandler())
handlers.append(EncodingFixHandler(encoding)) handlers.append(EncodingFixHandler(encoding))
if follow: if accept:
handlers.append(AlternateHandler(MIMETYPE[follow])) handlers.append(ContentNegociationHandler(MIMETYPE[accept], strict))
handlers.append(CacheHandler(force_min=delay)) handlers.append(CacheHandler(force_min=delay))
@ -197,28 +198,43 @@ class UAHandler(BaseHandler):
class AutoRefererHandler(BaseHandler): class AutoRefererHandler(BaseHandler):
def http_request(self, req): def http_request(self, req):
req.add_unredirected_header('Referer', '%s://%s' % (req.type, req.host)) req.add_unredirected_header('Referer', 'http://%s' % req.host)
return req return req
https_request = http_request https_request = http_request
class AlternateHandler(BaseHandler): class ContentNegociationHandler(BaseHandler):
" Follow <link rel='alternate' type='application/rss+xml' href='...' /> " " Handler for content negociation. Also parses <link rel='alternate' type='application/rss+xml' href='...' /> "
def __init__(self, follow=None): def __init__(self, accept=None, strict=False):
self.follow = follow or [] self.accept = accept
self.strict = strict
def http_request(self, req):
if self.accept is not None:
if isinstance(self.accept, basestring):
self.accept = (self.accept,)
string = ','.join(self.accept)
if self.strict:
string += ',*/*;q=0.9'
req.add_unredirected_header('Accept', string)
return req
def http_response(self, req, resp): def http_response(self, req, resp):
contenttype = resp.info().get('Content-Type', '').split(';')[0] contenttype = resp.info().get('Content-Type', '').split(';')[0]
if 200 <= resp.code < 300 and len(self.follow) and contenttype in MIMETYPE['html'] and contenttype not in self.follow: if 200 <= resp.code < 300 and self.accept is not None and self.strict and contenttype in MIMETYPE['html'] and contenttype not in self.accept:
# opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types # opps, not what we were looking for, let's see if the html page suggests an alternative page of the right types
data = resp.read() data = resp.read()
links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]') links = lxml.html.fromstring(data[:10000]).findall('.//link[@rel="alternate"]')
for link in links: for link in links:
if link.get('type', '') in self.follow: if link.get('type', '') in self.accept:
resp.code = 302 resp.code = 302
resp.msg = 'Moved Temporarily' resp.msg = 'Moved Temporarily'
resp.headers['location'] = link.get('href') resp.headers['location'] = link.get('href')
@ -230,6 +246,7 @@ class AlternateHandler(BaseHandler):
return resp return resp
https_request = http_request
https_response = http_response https_response = http_response

View File

@ -252,7 +252,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
delay = -2 delay = -2
try: try:
con = crawler.custom_handler(delay=delay, encoding=options.encoding).open(link, timeout=TIMEOUT) con = crawler.custom_handler('html', False, delay, options.encoding).open(link, timeout=TIMEOUT)
data = con.read() data = con.read()
except (IOError, HTTPException) as e: except (IOError, HTTPException) as e:
@ -335,7 +335,8 @@ def FeedFetch(url, options):
delay = 0 delay = 0
try: try:
con = crawler.custom_handler(follow='rss', delay=delay, encoding=options.encoding) \ con = crawler.custom_handler(accept='xml', strict=True, delay=delay,
encoding=options.encoding, basic=not options.items) \
.open(url, timeout=TIMEOUT * 2) .open(url, timeout=TIMEOUT * 2)
xml = con.read() xml = con.read()