From 1b7777c33121162a0e3df627d0e934dea5e69cf6 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sun, 15 Sep 2013 19:19:50 +0200 Subject: [PATCH] Find RSS links within html pages And cache those links --- morss.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/morss.py b/morss.py index dfd4b2c..ce559bb 100644 --- a/morss.py +++ b/morss.py @@ -369,18 +369,38 @@ def Gather(url, cachePath, progress=False): cache = Cache(cachePath, url) # fetch feed - if cache.isYoungerThan(DELAY) and url in cache: - log('xml cached') - xml = cache.get(url) + if cache.isYoungerThan(DELAY): + if 'xml' in cache: + log('xml cached') + xml = cache.get('xml') + if 'link' in cache: + log('link cached') + return Gather(cache.get('link'), cachePath, progress) else: try: opener = CacheDownload(cache.get(url), cache.get('etag'), cache.get('lastmodified')) con = urllib2.build_opener(opener).open(url) xml = con.read() - cache.set(url, xml) + except (urllib2.URLError, httplib.HTTPException, socket.timeout): + return False + + if con.info().type in ['text/xml', 'application/xml', 'application/rss+xml', + 'application/rdf+xml', 'application/atom+xml']: + cache.set('xml', xml) cache.set('etag', con.headers.getheader('etag')) cache.set('lastmodified', con.headers.getheader('last-modified')) - except (urllib2.URLError, httplib.HTTPException, socket.timeout): + elif con.info().type in ['text/html', 'application/xhtml+xml']: + match = lxml.html.fromstring(xml).xpath("//link[@rel='alternate'][@type='application/rss+xml' or @type='application/atom+xml']/@href") + if len(match): + link = urlparse.urljoin(url, match[0]) + cache.set('link', link) + return Gather(link, cachePath, progress) + else: + log('no-link html') + return False + else: + log(con.info().type) + log('random page') return False rss = feeds.parse(xml)