Extend urllib2 to download pages, use gzip
Cleaner than the old ad-hoc function. Handles decoding, gzip decompression and <meta> redirects (e.g. Washington Post). Might need extra testing.
parent 1fa8c4c535
commit 918dede4be

morss.py (85 changed lines)

@@ -16,10 +16,12 @@ import feeds
 import urllib2
 import socket
 from cookielib import CookieJar
 import chardet
 import urlparse
 
+from gzip import GzipFile
+from StringIO import StringIO
 
 from readability import readability
 
 LIM_ITEM = 100	# deletes what's beyond
@@ -182,25 +184,59 @@ class Cache:
 
 		return time.time() - os.path.getmtime(self._file) < sec
 
-def EncDownload(url):
-	try:
-		cj = CookieJar()
-		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
-		opener.addheaders = [('User-Agent', UA_HML)]
-		con = opener.open(url, timeout=TIMEOUT)
-		data = con.read()
-	except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as error:
-		log(error)
-		return False
-
-	# meta-redirect
-	match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
-	if match:
-		new_url = match.groups()[0]
-		log('redirect: %s' % new_url)
-		return EncDownload(new_url)
-
-	# encoding
+class HTMLDownloader(urllib2.HTTPCookieProcessor):
+	"""
+	Custom urllib2 handler to download html pages, following <meta> redirects,
+	using a browser user-agent and storing cookies.
+	"""
+	def __init__(self, cookiejar=None):
+		urllib2.HTTPCookieProcessor.__init__(self, cookiejar)
+		self.userAgent = UA_HML
+
+	def http_request(self, req):
+		urllib2.HTTPCookieProcessor.http_request(self, req)
+		req.add_header('Accept-Encoding', 'gzip')
+		return req
+
+	def http_response(self, req, resp):
+		urllib2.HTTPCookieProcessor.http_response(self, req, resp)
+
+		if 200 <= resp.code < 300 and resp.info().maintype == 'text':
+			data = resp.read()
+
+			# gzip
+			if resp.headers.get('Content-Encoding') == 'gzip':
+				log('un-gzip')
+				data = GzipFile(fileobj=StringIO(data), mode='r').read()
+
+			# <meta> redirect
+			match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
+			if match:
+				newurl = match.groups()[0]
+				log('redirect: %s' % newurl)
+
+				newheaders = dict((k,v) for k,v in req.headers.items()
+					if k.lower() not in ('content-length', 'content-type'))
+				new = urllib2.Request(newurl,
+					headers=newheaders,
+					origin_req_host=req.get_origin_req_host(),
+					unverifiable=True)
+
+				return self.parent.open(new, timeout=req.timeout)
+
+			# decode
+			data = decodeHTML(resp, data)
+
+			fp = StringIO(data)
+			old_resp = resp
+			resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
+			resp.msg = old_resp.msg
+		return resp
+
+	https_response = http_response
+	https_request = http_request
+
+def decodeHTML(con, data):
 	if con.headers.getparam('charset'):
 		log('header')
 		enc = con.headers.getparam('charset')
@@ -214,7 +250,7 @@ def EncDownload(url):
 		enc = chardet.detect(data)['encoding']
 
 	log(enc)
-	return (data.decode(enc, 'replace'), con.geturl())
+	return data.decode(enc, 'replace')
 
 def Fill(item, cache, feedurl='/', fast=False):
 	""" Returns True when it has done its best """
@@ -290,16 +326,17 @@ def Fill(item, cache, feedurl='/', fast=False):
 		return False
 
 	# download
-	ddl = EncDownload(item.link.encode('utf-8'))
-	if ddl is False:
+	try:
+		url = item.link.encode('utf-8')
+		con = urllib2.build_opener(HTMLDownloader()).open(url, timeout=TIMEOUT)
+		data = con.read()
+	except (urllib2.HTTPError, urllib2.URLError, socket.timeout) as error:
 		log('http error')
 		cache.set(item.link, 'error-http')
 		return True
 
-	data, url = ddl
-
-	out = readability.Document(data, url=url).summary(True)
+	out = readability.Document(data, url=con.url).summary(True)
 
 	if countWord(out) > max(count_content, count_desc) > 0:
 		setContent(item, out)
 		cache.set(item.link, out)
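
For reference, a minimal sketch (Python 2) of how the new handler is meant to be wired up, mirroring the change to Fill() above. It assumes morss.py is importable as a module; the URL and timeout value are illustrative placeholders, not values taken from morss.py.

import urllib2

from morss import HTMLDownloader  # assumption: morss.py is on the Python path

# Build an opener with the custom handler, same pattern as the new Fill() code.
opener = urllib2.build_opener(HTMLDownloader())
con = opener.open('http://example.com/article', timeout=4)  # placeholder URL/timeout

# http_response() has already un-gzipped the body, followed any
# <meta http-equiv="refresh"> redirect and decoded the charset,
# so read() returns ready-to-parse text and con.url is the final URL.
data = con.read()
print con.url, len(data)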