Extend urllib2 to download pages, use gzip

Cleaner than the old, dirty function. Handles decoding, gzip decompression and <meta> redirects (e.g. Washington Post). Might need extra testing.
master
pictuga 2013-07-16 23:33:45 +02:00
parent 1fa8c4c535
commit 918dede4be
1 changed file with 61 additions and 24 deletions


@@ -16,10 +16,12 @@ import feeds
 import urllib2
 import socket
 from cookielib import CookieJar
 import chardet
 import urlparse
+from gzip import GzipFile
+from StringIO import StringIO
 from readability import readability
 
 LIM_ITEM = 100    # deletes what's beyond
@@ -182,25 +184,59 @@ class Cache:
         return time.time() - os.path.getmtime(self._file) < sec
 
-def EncDownload(url):
-    try:
-        cj = CookieJar()
-        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
-        opener.addheaders = [('User-Agent', UA_HML)]
-        con = opener.open(url, timeout=TIMEOUT)
-        data = con.read()
-    except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as error:
-        log(error)
-        return False
-
-    # meta-redirect
-    match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
-    if match:
-        new_url = match.groups()[0]
-        log('redirect: %s' % new_url)
-        return EncDownload(new_url)
-
-    # encoding
+class HTMLDownloader(urllib2.HTTPCookieProcessor):
+    """
+    Custom urllib2 handler to download html pages, following <meta> redirects,
+    using a browser user-agent and storing cookies.
+    """
+    def __init__(self, cookiejar=None):
+        urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
+        self.userAgent = UA_HML
+
+    def http_request(self, req):
+        urllib2.HTTPCookieProcessor.http_request(self, req)
+        req.add_header('Accept-Encoding', 'gzip')
+        return req
+
+    def http_response(self, req, resp):
+        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
+        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
+            data = resp.read()
+
+            # gzip
+            if resp.headers.get('Content-Encoding') == 'gzip':
+                log('un-gzip')
+                data = GzipFile(fileobj=StringIO(data), mode='r').read()
+
+            # <meta> redirect
+            match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
+            if match:
+                newurl = match.groups()[0]
+                log('redirect: %s' % newurl)
+                newheaders = dict((k, v) for k, v in req.headers.items()
+                    if k.lower() not in ('content-length', 'content-type'))
+                new = urllib2.Request(newurl,
+                    headers=newheaders,
+                    origin_req_host=req.get_origin_req_host(),
+                    unverifiable=True)
+                return self.parent.open(new, timeout=req.timeout)
+
+            # decode
+            data = decodeHTML(resp, data)
+
+            fp = StringIO(data)
+            old_resp = resp
+            resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+        return resp
+
+    https_response = http_response
+    https_request = http_request
+
+def decodeHTML(con, data):
     if con.headers.getparam('charset'):
         log('header')
         enc = con.headers.getparam('charset')
@@ -214,7 +250,7 @@ def EncDownload(url):
         enc = chardet.detect(data)['encoding']
         log(enc)
 
-    return (data.decode(enc, 'replace'), con.geturl())
+    return data.decode(enc, 'replace')
 
 def Fill(item, cache, feedurl='/', fast=False):
     """ Returns True when it has done its best """
@@ -290,16 +326,17 @@ def Fill(item, cache, feedurl='/', fast=False):
         return False
 
     # download
-    ddl = EncDownload(item.link.encode('utf-8'))
-
-    if ddl is False:
+    try:
+        url = item.link.encode('utf-8')
+        con = urllib2.build_opener(HTMLDownloader()).open(url, timeout=TIMEOUT)
+        data = con.read()
+    except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as error:
         log('http error')
         cache.set(item.link, 'error-http')
         return True
 
-    data, url = ddl
-    out = readability.Document(data, url=url).summary(True)
+    out = readability.Document(data, url=con.url).summary(True)
 
     if countWord(out) > max(count_content, count_desc) > 0:
         setContent(item, out)
         cache.set(item.link, out)
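
For reference, a minimal standalone sketch of the urllib2 processor mechanism this commit relies on: any handler passed to build_opener() that defines http_request/http_response gets to rewrite the outgoing request and the incoming response, which is how HTMLDownloader injects the Accept-Encoding header and un-gzips the body before callers read it. The GzipHandler class and the example URL below are illustrative only, not part of the commit.

import urllib2
from gzip import GzipFile
from StringIO import StringIO

class GzipHandler(urllib2.BaseHandler):
    # Minimal processor: ask for gzip, transparently decompress the reply.

    def http_request(self, req):
        # called by the opener before the request goes out
        req.add_header('Accept-Encoding', 'gzip')
        return req

    def http_response(self, req, resp):
        # called by the opener once the response headers are in
        if resp.headers.get('Content-Encoding') == 'gzip':
            data = GzipFile(fileobj=StringIO(resp.read()), mode='r').read()
            fp = StringIO(data)
            old_resp = resp
            # re-wrap the decompressed body so callers can resp.read() as usual
            resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response

opener = urllib2.build_opener(GzipHandler())
print opener.open('http://example.com/', timeout=10).read()[:200]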