More flexible xml caching
Now includes a 'type' var, to remember what we did with it (normal, nothing, grabbed xml link, etc.). xml/html mimetypes are now saved in a dict, for easier editing and consistency.

branch: master
parent: edff54a016
commit: 9bc4417be3

morss.py | 36
diff --git a/morss.py b/morss.py
--- a/morss.py
+++ b/morss.py
@@ -36,6 +36,9 @@ DEBUG = False
 UA_RSS = 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)'
 UA_HTML = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'
 
+MIMETYPE = { 'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],
+    'html': ['text/html', 'application/xhtml+xml']}
+
 PROTOCOL = ['http', 'https', 'ftp']
 
 if 'REQUEST_URI' in os.environ:
@@ -173,7 +176,7 @@ class HTMLDownloader(urllib2.HTTPCookieProcessor):
             data = GzipFile(fileobj=StringIO(data), mode='r').read()
 
         # <meta> redirect
-        if resp.info().type in ['text/html', 'application/xhtml+xml']:
+        if resp.info().type in MIMETYPE['html']:
             match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
             if match:
                 newurl = match.groups()[0]
@@ -356,40 +359,49 @@ def Gather(url, cachePath, progress=False):
     log(cache._hash)
 
     # fetch feed
-    if cache.isYoungerThan(DELAY):
-        if 'xml' in cache:
-            log('xml cached')
-            xml = cache.get('xml')
-        if 'link' in cache:
-            log('link cached')
-            return Gather(cache.get('link'), cachePath, progress)
+    if cache.isYoungerThan(DELAY) and 'xml' in cache and 'style' in cache:
+        log('xml cached')
+        xml = cache.get('xml')
+        style = cache.get('style')
     else:
         try:
             opener = CacheDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified'))
-            con = urllib2.build_opener(opener).open(url)
+            con = urllib2.build_opener(opener).open(url, timeout=TIMEOUT)
             xml = con.read()
         except (urllib2.URLError, httplib.HTTPException, socket.timeout):
             return False
 
-        if xml[:5] == '<?xml' or con.info().type in ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml']:
-            cache.set('xml', xml)
-            cache.set('etag', con.headers.getheader('etag'))
-            cache.set('lastmodified', con.headers.getheader('last-modified'))
-        elif con.info().type in ['text/html', 'application/xhtml+xml']:
-            match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
-            if len(match):
-                link = urlparse.urljoin(url, match[0])
-                cache.set('link', link)
-                return Gather(link, cachePath, progress)
-            else:
-                log('no-link html')
-                return False
-        else:
-            log(con.info().type)
-            log('random page')
-            return False
+        cache.set('xml', xml)
+        cache.set('etag', con.headers.getheader('etag'))
+        cache.set('lastmodified', con.headers.getheader('last-modified'))
+        if xml[:5] == '<?xml' or con.info().type in MIMETYPE['xml']:
+            style = 'normal'
+        elif con.info().type in MIMETYPE['html']:
+            style = 'html'
+        else:
+            style = 'none'
+            log(con.info().type)
+
+        cache.set('style', style)
+
+    log(style)
+
+    if style == 'normal':
+        rss = feeds.parse(xml)
+    elif style == 'feedify':
+        rss = feedify.build(url, xml)
+    elif style == 'html':
+        match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href")
+        if len(match):
+            link = urlparse.urljoin(url, match[0])
+            return Gather(link, cachePath, progress)
+        else:
+            log('no-link html')
+            return False
+    else:
+        log('random page')
+        return False
 
-    rss = feeds.parse(xml)
     size = len(rss.items)
 
     # set
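For context, here is a minimal, self-contained sketch of the caching scheme this commit introduces. It is not code from morss itself: detect_style, get_feed and fetch are hypothetical names used only for illustration, while the MIMETYPE dict mirrors the one added in the diff above. The idea is that the detected 'style' is stored next to the raw xml, so a cache hit can skip content-type detection entirely.

# Hypothetical sketch of the commit's caching scheme; not morss code.
MIMETYPE = {'xml': ['text/xml', 'application/xml', 'application/rss+xml',
                    'application/rdf+xml', 'application/atom+xml'],
            'html': ['text/html', 'application/xhtml+xml']}

def detect_style(data, content_type):
    # Same classification the diff performs after a fresh download:
    # an xml prolog or xml mimetype is a real feed ('normal'), html
    # pages may carry a feed <link> ('html'), anything else is 'none'.
    if data[:5] == '<?xml' or content_type in MIMETYPE['xml']:
        return 'normal'
    elif content_type in MIMETYPE['html']:
        return 'html'
    else:
        return 'none'

def get_feed(cache, fetch):
    # fetch() stands in for the urllib2 download done in Gather().
    if 'xml' in cache and 'style' in cache:
        return cache['xml'], cache['style']  # cache hit: no re-detection
    data, content_type = fetch()
    style = detect_style(data, content_type)
    cache['xml'] = data                      # remember the payload...
    cache['style'] = style                   # ...and what we made of it
    return data, style

cache = {}
fetch = lambda: ('<?xml version="1.0"?><rss/>', 'application/rss+xml')
print(get_feed(cache, fetch))  # first call downloads and classifies
print(get_feed(cache, fetch))  # second call is served from the cache

One practical gain visible in the diff: the verdict is cached even for non-feeds, so within the cache delay a 'none' page is rejected and an html page has its alternate <link> re-extracted from the cached payload, in both cases without another download.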