Use one single urllib2 handler

Makes feedify on html pages more powerful
master
pictuga 2013-10-21 21:17:52 +02:00
parent 4d2d17f9e1
commit a1f9fe24c8
1 changed file with 28 additions and 39 deletions
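For context, a minimal sketch of how the merged handler is driven from the two call sites changed below. This is illustrative only: the URL is a placeholder, and the import assumes morss.py is importable as a module named morss; only SimpleDownload and TIMEOUT come from the code in this diff.

    import urllib2
    from morss import SimpleDownload, TIMEOUT  # assumes morss.py is on the path

    url = 'http://example.com/feed'  # placeholder URL

    # First fetch: no validators yet; keep the raw bytes for caching (decode=False)
    con = urllib2.build_opener(SimpleDownload(decode=False)).open(url, timeout=TIMEOUT)
    body = con.read()
    etag = con.headers.getheader('etag')
    lastmod = con.headers.getheader('last-modified')

    # Later refresh: hand the cached copy back to the handler. On a 304 the
    # handler replays `body` as a plain 200, so this code path never branches.
    opener = SimpleDownload(body, etag, lastmod, decode=False)
    fresh = urllib2.build_opener(opener).open(url, timeout=TIMEOUT).read()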


@@ -151,21 +151,41 @@ class Cache:
         return time.time() - os.path.getmtime(self._file) < sec

-class HTMLDownloader(urllib2.HTTPCookieProcessor):
+class SimpleDownload(urllib2.HTTPCookieProcessor):
     """
-    Custom urllib2 handler to download html pages, following <meta> redirects,
-    using a browser user-agent and storing cookies.
+    Custom urllib2 handler to download a page, using etag/last-modified headers,
+    to save bandwidth. The given headers are added back into the header on error
+    304 for easier use.
     """
-    def __init__(self, useragent=UA_HTML, cookiejar=None):
+    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_HTML, decode=False, cookiejar=None):
         urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
+        self.cache = cache
+        self.etag = etag
+        self.lastmodified = lastmodified
         self.useragent = useragent
+        self.decode = decode

     def http_request(self, req):
         urllib2.HTTPCookieProcessor.http_request(self, req)
         req.add_unredirected_header('Accept-Encoding', 'gzip')
         req.add_unredirected_header('User-Agent', self.useragent)
+        req.add_unredirected_header('Referer', 'http://%s' % req.get_host())
+        if self.cache:
+            if self.etag:
+                req.add_unredirected_header('If-None-Match', self.etag)
+            if self.lastmodified:
+                req.add_unredirected_header('If-Modified-Since', self.lastmodified)
         return req

+    def http_error_304(self, req, fp, code, msg, headers):
+        log('http cached')
+        if self.etag:
+            headers.addheader('etag', self.etag)
+        if self.lastmodified:
+            headers.addheader('last-modified', self.lastmodified)
+        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
+        return resp
+
     def http_response(self, req, resp):
         urllib2.HTTPCookieProcessor.http_response(self, req, resp)
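urllib2 dispatches non-2xx responses to http_error_<code> methods on every installed handler before raising HTTPError, which is how the new http_error_304 above turns "304 Not Modified" into a synthetic 200 carrying the cached body. A stripped-down sketch of the same pattern on its own, independent of morss (the class name and cached_body parameter are illustrative):

    import urllib2
    from StringIO import StringIO

    class Replay304(urllib2.BaseHandler):
        # Turn '304 Not Modified' into a 200 whose body is a cached copy,
        # so callers can treat cached and fresh responses identically.
        def __init__(self, cached_body=''):
            self.cached_body = cached_body

        def http_error_304(self, req, fp, code, msg, headers):
            # urllib2 calls http_error_<code>() on each handler in the chain;
            # returning a response object here short-circuits the error path.
            return urllib2.addinfourl(StringIO(self.cached_body), headers,
                req.get_full_url(), 200)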
@@ -194,6 +214,7 @@ class HTMLDownloader(urllib2.HTTPCookieProcessor):
                 return self.parent.open(new, timeout=req.timeout)

         # decode
-        data = decodeHTML(data, resp)
+        if self.decode:
+            data = decodeHTML(data, resp)

         fp = StringIO(data)
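The new decode flag makes charset handling opt-in, so feed XML can be cached byte-for-byte while HTML pages still get decoded. A rough stand-in for the gated call, assuming the response headers expose the charset the way decodeHTML (partially visible below) reads it; decode_by_header is a hypothetical name, not part of the diff:

    def decode_by_header(data, resp):
        # Mirrors the header branch of decodeHTML: take the charset from the
        # Content-Type parameter if the server sent one, else leave bytes as-is.
        charset = resp.headers.getparam('charset')
        return data.decode(charset) if charset else data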
@@ -205,38 +226,6 @@ class HTMLDownloader(urllib2.HTTPCookieProcessor):
     https_response = http_response
     https_request = http_request

-class CacheDownload(urllib2.BaseHandler):
-    """
-    Custom urllib2 handler to download a page, using etag/last-modified headers,
-    to save bandwidth. The given headers are added back into the header on error
-    304 for easier use.
-    """
-    def __init__(self, cache="", etag=None, lastmodified=None, useragent=UA_RSS):
-        self.cache = cache
-        self.etag = etag
-        self.lastmodified = lastmodified
-        self.useragent = useragent
-
-    def http_request(self, req):
-        req.add_unredirected_header('User-Agent', self.useragent)
-        if self.cache:
-            if self.etag:
-                req.add_unredirected_header('If-None-Match', self.etag)
-            if self.lastmodified:
-                req.add_unredirected_header('If-Modified-Since', self.lastmodified)
-        return req
-
-    def http_error_304(self, req, fp, code, msg, headers):
-        log('http cached')
-        if self.etag:
-            headers.addheader('etag', self.etag)
-        if self.lastmodified:
-            headers.addheader('last-modified', self.lastmodified)
-        resp = urllib2.addinfourl(StringIO(self.cache), headers, req.get_full_url(), 200)
-        return resp
-
-    https_request = http_request
-
 def decodeHTML(data, con=None):
     if con is not None and con.headers.getparam('charset'):
         log('header')
@@ -355,7 +344,7 @@ def Fill(item, cache, feedurl='/', fast=False):
     # download
     try:
         url = link.encode('utf-8')
-        con = urllib2.build_opener(HTMLDownloader()).open(url, timeout=TIMEOUT)
+        con = urllib2.build_opener(SimpleDownload()).open(url, timeout=TIMEOUT)
         data = con.read()
     except (urllib2.URLError, httplib.HTTPException, socket.timeout):
         log('http error')
@@ -394,7 +383,7 @@ def Gather(url, cachePath, options):
         style = cache.get('style')
     else:
         try:
-            opener = CacheDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), useragent=UA_HTML)
+            opener = SimpleDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'), decode=False)
             con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT)
             xml = con.read()
         except (urllib2.URLError, httplib.HTTPException, socket.timeout):
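After a real 200 on this path, the validators presumably get stored back so the next run can send If-None-Match / If-Modified-Since. A sketch of that bookkeeping, where cache.set is an assumed counterpart to the cache.get calls above and is not shown in this diff:

    # Hypothetical follow-up to the fetch above (cache.set is assumed):
    cache.set(url, xml)
    cache.set('etag', con.headers.getheader('etag'))
    cache.set('lastmodified', con.headers.getheader('last-modified'))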