#! /usr/bin/env python2.7
"""Fill the descriptions of an RSS feed with content scraped from each item.

Usage: ``script [XPATH] < feed.xml > full_feed.xml``

The feed is read from standard input.  For every ``<item>``, the page behind
its ``<link>`` is downloaded, the first node matching XPATH (default
``//h1/..``) is serialised, and the result becomes the text of the item's
``<description>``.  Serialised fragments are cached under ``~/.cache/morss``
so each link is only downloaded once.

The resulting feed is printed to standard output unless the ``DEBUG``
environment variable is set to a non-empty value, in which case only
progress messages are printed.
"""
import sys
import os
from os.path import expanduser
from lxml import etree
import string
import urllib2
import urllib
from cookielib import CookieJar

# Control characters that XML 1.0 does not allow in a document: everything
# below 0x20 except tab, line feed and carriage return.
_XML_ILLEGAL_CONTROL_CHARS = ''.join(
    chr(code) for code in range(32) if chr(code) not in '\t\n\r')


def log(txt):
    """Print *txt*, but only when DEBUG is set to a non-empty value."""
    if os.getenv('DEBUG', False):
        print(txt)


def xmlclean(xml):
    """Return the byte string *xml* without XML-illegal control characters.

    Tab, line feed and carriage return are kept: they are legal XML
    whitespace, and deleting them can fuse tokens that are separated only by
    a line break (e.g. a tag name and its first attribute).
    """
    return xml.translate(None, _XML_ILLEGAL_CONTROL_CHARS)


# XPath selecting the node to extract from every linked page.
node = sys.argv[1] if len(sys.argv) > 1 else "//h1/.."

xml = xmlclean(sys.stdin.read())
rss = etree.XML(xml)
items = rss.xpath('//item')

cache = os.path.join(expanduser("~"), ".cache", "morss")
if not os.path.exists(cache):
    os.makedirs(cache)

for item in items:
    link = item.findtext('link')
    desc = item.find('description')

    # Items lacking a usable link or a description element cannot be
    # processed; skip them instead of aborting the whole feed.
    if not link or desc is None:
        log("item without link or description")
        continue

    link = link.encode('utf-8')
    log(link)

    # NOTE(review): the built-in hash() is kept as the cache key so existing
    # cache entries remain valid; it is not guaranteed to be identical across
    # platforms or when hash randomisation (-R) is enabled.
    cached = os.path.join(cache, str(hash(link)))
    log(cached)

    if os.path.exists(cached):
        log("cached")
        with open(cached, 'r') as cache_file:
            desc.text = cache_file.read()
        continue

    # A fresh cookie jar is used for every article, so cookies set while
    # fetching one page are never sent when fetching the next.
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    try:
        data = opener.open(link).read()
    except (urllib2.HTTPError, urllib2.URLError) as error:
        log(error)
        log("http error")
        continue

    html = etree.HTML(data)
    match = html.xpath(node)
    if not len(match):
        log("no match")
        continue

    try:
        text = etree.tostring(match[0])
        log("ok txt")
    except etree.SerialisationError:
        log('serialisation')
        continue

    try:
        # The assignment comes first: a fragment that lxml rejects
        # (ValueError) must not end up in the cache.
        desc.text = text
        with open(cached, 'w') as cache_file:
            cache_file.write(text)
    except ValueError:
        log('xml error')

if not os.getenv('DEBUG', False):
    print(etree.tostring(rss))