Auto-detection of server-mode, better caching.
The SERVER variable is no longer needed: server mode is now detected from the environment. The RSS .xml file is now cached for a very short time, to make loading faster and hopefully reduce the risk of being banned a little. A more common User-Agent is used to further cut down on bans. Added the ability to test whether a key is in the Cache.
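The short-lived feed cache boils down to comparing the cache file's modification time against DELAY (in minutes) before re-downloading. A minimal sketch of that check in isolation, not the full Cache class (the function and variable names here are illustrative only):

    import os.path
    import time

    DELAY = 10  # minutes, mirroring the DELAY=10 constant added below

    def is_younger_than(path, sec):
        # The cached .xml counts as fresh if the cache file was modified
        # less than `sec` seconds ago (same logic as Cache.isYoungerThan()).
        if not os.path.exists(path):
            return False
        return os.path.getmtime(path) > time.time() - sec

    # Gather() only hits the network when the cache is stale or the URL is
    # missing from it, roughly:
    #   if cache.isYoungerThan(DELAY*60) and url in cache: reuse the cached xml
    #   else: fetch with urllib2, then cache.set(url, xml)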
parent a616c96e32
commit 1480bd7af4
morss.py (59 changed lines)
@@ -3,6 +3,7 @@ import sys
 import os
 from base64 import b64encode, b64decode
 import os.path
+import time
 import lxml.etree
 import lxml.objectify
 import lxml.html
@@ -18,6 +19,7 @@ from readability import readability
 
 SERVER = True
 MAX = 70
+DELAY=10
 
 ITEM_MAP = {
     'link': (('{http://www.w3.org/2005/Atom}link', 'href'), '{}link'),
@@ -34,7 +36,7 @@ RSS_MAP = {
     'entry': ('{http://www.w3.org/2005/Atom}entry', '{}item')
     }
 
-if SERVER:
+if 'REQUEST_URI' in os.environ:
     import httplib
     httplib.HTTPConnection.debuglevel = 1
 
@@ -42,9 +44,10 @@ if SERVER:
     cgitb.enable()
 
 def log(txt):
-    if not SERVER and os.getenv('DEBUG', False):
-        print txt
-    if SERVER:
+    if not 'REQUEST_URI' in os.environ:
+        if os.getenv('DEBUG', False):
+            print txt
+    else:
         with open('morss.log', 'a') as file:
             file.write(repr(txt).encode('utf-8') + "\n")
 
@@ -72,8 +75,12 @@ class Cache:
     def __del__(self):
         self.save()
 
+    def __contains__(self, key):
+        return key in self._cached
+
     def get(self, key):
         if key in self._cached:
+            self._cache[key] = self._cached[key]
             return b64decode(self._cached[key])
         else:
             return None
@@ -98,6 +105,12 @@ class Cache:
 
         open(self._file, 'w').write(txt)
 
+    def isYoungerThan(self, sec):
+        if not os.path.exists(self._file):
+            return False
+
+        return os.path.getmtime(self._file) > time.time()-sec
+
 class XMLMap(object):
     """
     Sort of wrapper around lxml.objectify.StringElement (from which this
@@ -277,11 +290,9 @@ def Fill(rss, cache):
     log(item.link)
 
     # check cache
-    cached = cache.get(item.link)
-    if cached is not None:
+    if item.link in cache:
         log('cached')
-        item.content = cached
-        cache.set(item.link, cached)
+        item.content = cache.get(item.link)
         return item
 
     # download
@@ -298,22 +309,28 @@ def Fill(rss, cache):
     item.content = out
     cache.set(item.link, out)
 
-def Gather(data, cachePath):
+def Gather(url, cachePath):
+    cache = Cache(cachePath, url)
+
     # fetch feed
-    if data.startswith("http"):
-        req = urllib2.Request(data)
-        req.add_unredirected_header('User-Agent', '')
-        xml = urllib2.urlopen(req).read()
+    if cache.isYoungerThan(DELAY*60) and url in cache:
+        log('xml cached')
+        xml = cache.get(url)
     else:
-        xml = data
+        try:
+            req = urllib2.Request(url)
+            req.add_unredirected_header('User-Agent', 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)')
+            xml = urllib2.urlopen(req).read()
+            cache.set(url, xml)
+        except (urllib2.HTTPError, urllib2.URLError):
+            print "Error, couldn't fetch RSS feed (the server might be banned from the given website)."
+            return False
 
     xml = cleanXML(xml)
     rss = lxml.objectify.fromstring(xml)
    root = rss.channel if hasattr(rss, 'channel') else rss
     root = XMLMap(root, RSS_MAP)
 
-    cache = Cache(cachePath, unicode(root.title))
-
     # set
     if MAX:
         for item in root.item[MAX:]:
@@ -324,7 +341,7 @@ def Gather(data, cachePath):
     return root.tostring(xml_declaration=True, encoding='UTF-8')
 
 if __name__ == "__main__":
-    if SERVER:
+    if 'REQUEST_URI' in os.environ:
         print 'Status: 200'
         print 'Content-Type: text/html\n'
 
@@ -340,11 +357,15 @@ if __name__ == "__main__":
         log(url)
         RSS = Gather(url, cache)
     else:
-        xml = sys.stdin.read()
-        cache = os.path.expanduser('~') + '/.cache/morss'
-        RSS = Gather(xml, cache)
+        if len(sys.argv) > 1 and sys.argv[1].startswith('http'):
+            url = sys.argv[1]
+            cache = os.path.expanduser('~') + '/.cache/morss'
+            RSS = Gather(url, cache)
+        else:
+            print "Please provide url."
+            sys.exit(1)
 
-    if SERVER or not os.getenv('DEBUG', False):
+    if 'REQUEST_URI' in os.environ or not os.getenv('DEBUG', False) and RSS is not False:
         print RSS
 
     log('done')
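Since Gather() now takes a URL and returns False when the download fails, a quick way to exercise the new code path from another Python 2 script might look like this (a sketch only; it assumes morss.py is importable and that ~/.cache/morss is writable):

    import os
    import morss  # assumes morss.py sits on the Python path

    cache_path = os.path.expanduser('~') + '/.cache/morss'  # same default as the command-line branch
    rss = morss.Gather('http://example.com/feed.xml', cache_path)  # placeholder feed URL

    if rss is not False:  # Gather() returns False when fetching the feed fails
        print rss  # the processed feed, as a UTF-8 XML string

In CGI mode nothing extra is needed: the presence of REQUEST_URI in the environment is what now switches the script into server behaviour.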