morss/morss

70 lines
1.4 KiB
Plaintext
Raw Normal View History

2013-02-25 14:50:32 +00:00
#! /usr/bin/env python2.7
import sys
2013-02-25 17:01:59 +00:00
import os
from os.path import expanduser
2013-02-25 14:50:32 +00:00
from lxml import etree
2013-03-01 13:26:51 +00:00
import string
2013-02-25 14:50:32 +00:00
import urllib2
import urllib
from cookielib import CookieJar
2013-02-25 17:01:59 +00:00
def log(txt):
    """Print *txt* to stdout when the DEBUG environment variable is set.

    Nothing is printed when DEBUG is unset or empty.
    """
    debug_enabled = os.getenv('DEBUG', False)
    if not debug_enabled:
        return
    # A single parenthesised expression, so this is the same statement as
    # ``print txt`` under Python 2.
    print(txt)
2013-02-25 14:50:32 +00:00
def xmlclean(xml):
    """Return *xml* with the control characters that XML 1.0 forbids removed.

    Characters 0x00-0x1F are deleted except tab (0x09), line feed (0x0A)
    and carriage return (0x0D). Those three are legal XML whitespace and
    must be kept: they frequently separate attributes that are written on
    several lines, and deleting them would glue the attributes (or words
    of text) together and make the document unparsable.

    xml -- the raw document as a string.
    Returns the cleaned string; an input without illegal characters is
    returned unchanged.
    """
    # Local import so this block does not depend on the file's import list.
    import re

    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', xml)
# XPath expression selecting the part of each linked page to embed in the
# feed. Taken from the first command-line argument; the default picks the
# parent element of an <h1>.
node = sys.argv[1] if len(sys.argv) > 1 else "//h1/.."

# The feed is read from stdin, cleaned of control characters and parsed.
xml = xmlclean(sys.stdin.read())
rss = etree.XML(xml)
items = rss.xpath('//item')

# Extracted page fragments are cached on disk, one file per link.
cache = expanduser("~") + "/.cache/morss"
if not os.path.exists(cache):
    os.makedirs(cache)

for item in items:
    # findtext() returns None when the item has no <link>; surrounding
    # whitespace from pretty-printed feeds is not part of the URL.
    link = (item.findtext('link') or '').strip().encode('utf-8')
    descs = item.xpath('description')
    if not link or not descs:
        # Nothing to fetch, or nowhere to store the result: leave the item
        # untouched instead of aborting the whole feed.
        log("no link or description")
        continue
    desc = descs[0]

    log(link)

    # NOTE(review): hash() of a string is only stable between runs while
    # hash randomisation is off (the Python 2.7 default) -- confirm before
    # running with -R or porting.
    cached = cache + "/" + str(hash(link))
    log(cached)

    if os.path.exists(cached):
        log("cached")
        with open(cached, 'r') as cached_file:
            desc.text = cached_file.read()
    else:
        try:
            cj = CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            response = opener.open(link)
            try:
                data = response.read()
            finally:
                response.close()

            html = etree.HTML(data)
            match = html.xpath(node)

            if match:
                try:
                    text = etree.tostring(match[0])
                    log("ok txt")
                except etree.SerialisationError:
                    log('serialisation')
                    continue
                try:
                    # Assign first: if the text is rejected (ValueError),
                    # nothing is written to the cache.
                    desc.text = text
                    with open(cached, 'w') as cached_file:
                        cached_file.write(text)
                except ValueError:
                    log('xml error')
            else:
                log("no match")

        except (urllib2.HTTPError, urllib2.URLError) as error:
            log(error)
            log("http error")

# In debug mode stdout carries the log lines, so the feed is only emitted
# when DEBUG is not set.
if not os.getenv('DEBUG', False):
    print(etree.tostring(rss))