Extend urllib2 to download pages, use gzip
Cleaner than the dirty function it replaces. Handles decoding, gzip decompression and meta redirects (e.g. Washington Post). Might need extra testing.
parent 1fa8c4c535
commit 918dede4be
81 morss.py
@@ -16,10 +16,12 @@ import feeds
 import urllib2
 import socket
 from cookielib import CookieJar
 import chardet
 import urlparse

+from gzip import GzipFile
+from StringIO import StringIO

 from readability import readability

 LIM_ITEM = 100    # deletes what's beyond
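The two new imports carry the gzip path: GzipFile wrapped around a StringIO buffer decompresses an in-memory HTTP body. A minimal standalone sketch of that pattern (Python 2; the payload is made up here):

from gzip import GzipFile
from StringIO import StringIO

# Build a gzip-compressed stand-in for an HTTP response body.
buf = StringIO()
gz = GzipFile(fileobj=buf, mode='w')
gz.write('<html><body>hello</body></html>')
gz.close()
compressed = buf.getvalue()

# The same decompression idiom used in http_response() below:
data = GzipFile(fileobj=StringIO(compressed), mode='r').read()
assert data == '<html><body>hello</body></html>'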
@@ -182,25 +184,59 @@ class Cache:
         return time.time() - os.path.getmtime(self._file) < sec

-def EncDownload(url):
-    try:
-        cj = CookieJar()
-        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
-        opener.addheaders = [('User-Agent', UA_HML)]
-        con = opener.open(url, timeout=TIMEOUT)
-        data = con.read()
-    except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as error:
-        log(error)
-        return False
+class HTMLDownloader(urllib2.HTTPCookieProcessor):
+    """
+    Custom urllib2 handler to download html pages, following <meta> redirects,
+    using a browser user-agent and storing cookies.
+    """
+    def __init__(self, cookiejar=None):
+        urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
+        self.userAgent = UA_HML

-    # meta-redirect
-    match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
-    if match:
-        new_url = match.groups()[0]
-        log('redirect: %s' % new_url)
-        return EncDownload(new_url)
+    def http_request(self, req):
+        urllib2.HTTPCookieProcessor.http_request(self, req)
+        req.add_header('Accept-Encoding', 'gzip')
+        return req
+
+    def http_response(self, req, resp):
+        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
+
+        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
+            data = resp.read()
+
+            # gzip
+            if resp.headers.get('Content-Encoding') == 'gzip':
+                log('un-gzip')
+                data = GzipFile(fileobj=StringIO(data), mode='r').read()
+
+            # <meta> redirect
+            match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
+            if match:
+                newurl = match.groups()[0]
+                log('redirect: %s' % newurl)
+
+                newheaders = dict((k,v) for k,v in req.headers.items()
+                    if k.lower() not in ('content-length', 'content-type'))
+                new = urllib2.Request(newurl,
+                    headers=newheaders,
+                    origin_req_host=req.get_origin_req_host(),
+                    unverifiable=True)
+
+                return self.parent.open(new, timeout=req.timeout)
+
+            # decode
+            data = decodeHTML(resp, data)
+
+            fp = StringIO(data)
+            old_resp = resp
+            resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+        return resp
+
+    https_response = http_response
+    https_request = http_request

-    # encoding
-    if con.headers.getparam('charset'):
+def decodeHTML(con, data):
+    if con.headers.getparam('charset'):
         log('header')
         enc = con.headers.getparam('charset')
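The redirect branch hinges on the regex above: it tolerates either quote style around "refresh" and stops the captured URL at the closing quote. A quick standalone check (Python 2; the markup is hypothetical):

import re

html = '<meta http-equiv="refresh" content="0; url=http://example.com/next">'
match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', html)
print match.groups()[0]  # http://example.com/next

Building a fresh urllib2.Request and handing it back to self.parent.open (instead of the old recursive EncDownload call) keeps the redirect inside the opener, so the cookie and user-agent handlers still apply to the second fetch.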
@@ -214,7 +250,7 @@ def EncDownload(url):
         enc = chardet.detect(data)['encoding']

     log(enc)
-    return (data.decode(enc, 'replace'), con.geturl())
+    return data.decode(enc, 'replace')

 def Fill(item, cache, feedurl='/', fast=False):
     """ Returns True when it has done its best """
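decodeHTML now returns only the decoded string, since the caller already holds the URL on the connection object. Its fallback chain (declared charset first, chardet guess second) looks like this in isolation; a sketch, where guess_decode is a hypothetical name and the sample bytes are made up:

import chardet

def guess_decode(data, declared=None):
    # Prefer the charset declared in the HTTP headers; otherwise guess.
    enc = declared or chardet.detect(data)['encoding'] or 'utf-8'
    return data.decode(enc, 'replace')

print repr(guess_decode('d\xc3\xa9j\xc3\xa0 vu, na\xc3\xafve caf\xc3\xa9'))  # chardet should spot utf-8
print repr(guess_decode('caf\xe9', declared='latin-1'))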
@@ -290,16 +326,17 @@ def Fill(item, cache, feedurl='/', fast=False):
         return False

     # download
-    ddl = EncDownload(item.link.encode('utf-8'))
-
-    if ddl is False:
+    try:
+        url = item.link.encode('utf-8')
+        con = urllib2.build_opener(HTMLDownloader()).open(url, timeout=TIMEOUT)
+        data = con.read()
+    except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as error:
         log('http error')
         cache.set(item.link, 'error-http')
         return True

-    data, url = ddl
-    out = readability.Document(data, url=url).summary(True)
+    out = readability.Document(data, url=con.url).summary(True)

     if countWord(out) > max(count_content, count_desc) > 0:
         setContent(item, out)
         cache.set(item.link, out)
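Fill() is now just one client of the handler; anything else can reuse it through build_opener. A hedged end-to-end sketch (Python 2; assumes HTMLDownloader is importable from the morss module, and the timeout is a placeholder):

import socket
import urllib2

from morss import HTMLDownloader  # the handler added in this commit

def fetch(url, timeout=4):
    # Returns (decoded html, final url), or None on failure,
    # mirroring the error path Fill() caches as 'error-http'.
    try:
        con = urllib2.build_opener(HTMLDownloader()).open(url, timeout=timeout)
        return con.read(), con.url
    except (urllib2.HTTPError, urllib2.URLError, socket.timeout):
        return None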