Auto-detection of server-mode, better caching.

The SERVER variable is no longer needed. The RSS .xml file is now cached for a very short time, to make loading faster and hopefully reduce bans a little. A more common User-Agent is now used to try to cut down on bans. Added the ability to test whether a key is in the Cache.
master
pictuga 2013-04-23 00:00:07 +02:00
parent a616c96e32
commit 1480bd7af4
1 changed files with 42 additions and 21 deletions

View File

@ -3,6 +3,7 @@ import sys
import os import os
from base64 import b64encode, b64decode from base64 import b64encode, b64decode
import os.path import os.path
import time
import lxml.etree import lxml.etree
import lxml.objectify import lxml.objectify
import lxml.html import lxml.html
@ -18,6 +19,7 @@ from readability import readability
SERVER = True SERVER = True
MAX = 70 MAX = 70
DELAY=10
ITEM_MAP = { ITEM_MAP = {
'link': (('{http://www.w3.org/2005/Atom}link', 'href'), '{}link'), 'link': (('{http://www.w3.org/2005/Atom}link', 'href'), '{}link'),
@ -34,7 +36,7 @@ RSS_MAP = {
'entry': ('{http://www.w3.org/2005/Atom}entry', '{}item') 'entry': ('{http://www.w3.org/2005/Atom}entry', '{}item')
} }
if SERVER: if 'REQUEST_URI' in os.environ:
import httplib import httplib
httplib.HTTPConnection.debuglevel = 1 httplib.HTTPConnection.debuglevel = 1
@ -42,9 +44,10 @@ if SERVER:
cgitb.enable() cgitb.enable()
def log(txt): def log(txt):
if not SERVER and os.getenv('DEBUG', False): if not 'REQUEST_URI' in os.environ:
print txt if os.getenv('DEBUG', False):
if SERVER: print txt
else:
with open('morss.log', 'a') as file: with open('morss.log', 'a') as file:
file.write(repr(txt).encode('utf-8') + "\n") file.write(repr(txt).encode('utf-8') + "\n")
@ -72,8 +75,12 @@ class Cache:
def __del__(self): def __del__(self):
self.save() self.save()
def __contains__(self, key):
return key in self._cached
def get(self, key): def get(self, key):
if key in self._cached: if key in self._cached:
self._cache[key] = self._cached[key]
return b64decode(self._cached[key]) return b64decode(self._cached[key])
else: else:
return None return None
@ -98,6 +105,12 @@ class Cache:
open(self._file, 'w').write(txt) open(self._file, 'w').write(txt)
def isYoungerThan(self, sec):
if not os.path.exists(self._file):
return False
return os.path.getmtime(self._file) > time.time()-sec
class XMLMap(object): class XMLMap(object):
""" """
Sort of wrapper around lxml.objectify.StringElement (from which this Sort of wrapper around lxml.objectify.StringElement (from which this
@ -277,11 +290,9 @@ def Fill(rss, cache):
log(item.link) log(item.link)
# check cache # check cache
cached = cache.get(item.link) if item.link in cache:
if cached is not None:
log('cached') log('cached')
item.content = cached item.content = cache.get(item.link)
cache.set(item.link, cached)
return item return item
# download # download
@ -298,22 +309,28 @@ def Fill(rss, cache):
item.content = out item.content = out
cache.set(item.link, out) cache.set(item.link, out)
def Gather(data, cachePath): def Gather(url, cachePath):
cache = Cache(cachePath, url)
# fetch feed # fetch feed
if data.startswith("http"): if cache.isYoungerThan(DELAY*60) and url in cache:
req = urllib2.Request(data) log('xml cached')
req.add_unredirected_header('User-Agent', '') xml = cache.get(url)
xml = urllib2.urlopen(req).read()
else: else:
xml = data try:
req = urllib2.Request(url)
req.add_unredirected_header('User-Agent', 'Liferea/1.8.12 (Linux; fr_FR.utf8; http://liferea.sf.net/)')
xml = urllib2.urlopen(req).read()
cache.set(url, xml)
except (urllib2.HTTPError, urllib2.URLError):
print "Error, couldn't fetch RSS feed (the server might be banned from the given website)."
return False
xml = cleanXML(xml) xml = cleanXML(xml)
rss = lxml.objectify.fromstring(xml) rss = lxml.objectify.fromstring(xml)
root = rss.channel if hasattr(rss, 'channel') else rss root = rss.channel if hasattr(rss, 'channel') else rss
root = XMLMap(root, RSS_MAP) root = XMLMap(root, RSS_MAP)
cache = Cache(cachePath, unicode(root.title))
# set # set
if MAX: if MAX:
for item in root.item[MAX:]: for item in root.item[MAX:]:
@ -324,7 +341,7 @@ def Gather(data, cachePath):
return root.tostring(xml_declaration=True, encoding='UTF-8') return root.tostring(xml_declaration=True, encoding='UTF-8')
if __name__ == "__main__": if __name__ == "__main__":
if SERVER: if 'REQUEST_URI' in os.environ:
print 'Status: 200' print 'Status: 200'
print 'Content-Type: text/html\n' print 'Content-Type: text/html\n'
@ -340,11 +357,15 @@ if __name__ == "__main__":
log(url) log(url)
RSS = Gather(url, cache) RSS = Gather(url, cache)
else: else:
xml = sys.stdin.read() if len(sys.argv) > 1 and sys.argv[1].startswith('http'):
cache = os.path.expanduser('~') + '/.cache/morss' url = sys.argv[1]
RSS = Gather(xml, cache) cache = os.path.expanduser('~') + '/.cache/morss'
RSS = Gather(url, cache)
else:
print "Please provide url."
sys.exit(1)
if SERVER or not os.getenv('DEBUG', False): if 'REQUEST_URI' in os.environ or not os.getenv('DEBUG', False) and RSS is not False:
print RSS print RSS
log('done') log('done')