Another huge commit.

Now uses OOP where it fits. Atom feeds are supported, but no real tests were made. Unix globbing is now possible for URLs. Caching is done in a cleaner way. Feedburner links are also replaced. HTML is cleaned in a more efficient way. Code is now much cleaner, using lxml.objectify and a small wrapper to access Atom feeds as if they were RSS feeds (and much faster than feedparser). README has been updated.

branch master
parent a098b7e104
commit af8879049f
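In practice the new entry point is a single call. A minimal usage sketch (not part of the commit; it assumes morss.py from this commit is importable as a module, and the feed URL is only an example):

import os
import morss  # assumption: the morss.py changed below, sitting next to this script

# Gather() fetches the feed, picks the matching xpath rule and fills each item
# with the full article text, returning the rewritten feed as a string.
full_feed = morss.Gather('http://www.franceinfo.fr/rss.xml', os.getcwd() + '/cache')
print full_feed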
@@ -3,6 +3,8 @@
 This tool's goal is to get full-text RSS feeds out of striped RSS feeds, commonly available on internet. Indeed most newspapers only make a small description available to users in their rss feeds, which makes the RSS feed rather useless. So this tool intends to fix that problem.
 This tool opens the links from the rss feed, then downloads the full article from the newspaper website and puts it back in the rss feed.
 
+morss also has experimental support for Atom feeds.
+
 ##(xpath) Rules
 
 To find the article content on the newspaper's website, morss need to know where to look at. The default target is the first `<h1>` element, since it's a common practice, or a `<article>` element, for HTML5 compliant websites.
@@ -19,9 +21,12 @@ Here, xpath rules stored in the `rules` file. (The name of the file can be chang
 
 Fancy name (description)(useless but not optional)
 http://example.com/path/to/the/rss/feed.xml
+http://example.co.uk/other/*/path/with/wildcard/*.xml
 //super/accurate[@xpath='expression']/..
 
-Works like a charm with Tiny TinyRSS (<http://tt-rss.org/redmine/projects/tt-rss/wiki>).
+As shown in the example, multiple urls can be specified for a single rule, so as to be able to match feeds from different locations of the website server (for example with or without "www."). Moreover feeds urls can be *NIX glob-style patterns, so as to match any feed from a website.
+
+Works like a charm with Tiny Tiny RSS (<http://tt-rss.org/redmine/projects/tt-rss/wiki>).
 
 ###As a newsreader hook
 
@@ -51,4 +56,4 @@ Unwanted HTML elements are also stripped from the article. By default, elements
 ---
 
 GPL3 licence.
-Python **2.6** required (not 3).
+Python **2.6**+ required (not 3).

@@ -1 +0,0 @@
-DefaultType text/html
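The glob-style matching added in the README above is plain *NIX filename globbing applied to the feed URL. A minimal sketch using the stdlib fnmatch helper that the new parseRules() in morss.py relies on (the URLs are the README's own examples):

from fnmatch import fnmatch

pattern = 'http://example.co.uk/other/*/path/with/wildcard/*.xml'
print fnmatch('http://example.co.uk/other/2013/path/with/wildcard/feed.xml', pattern)  # True
print fnmatch('http://example.com/path/to/the/rss/feed.xml', pattern)                  # False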
483 morss.py
@@ -1,15 +1,42 @@
 #!/usr/bin/env python
 import sys
 import os
-from os.path import expanduser
-from lxml import etree
+import copy
+from base64 import b64encode, b64decode
+from fnmatch import fnmatch
+import os.path
+import lxml.etree
+import lxml.objectify
+import lxml.html
+import lxml.html.clean
+import lxml.builder
 import re
 import string
 import urllib2
 from cookielib import CookieJar
 import chardet
 
+# DISCLAIMER: feedparser is pure shit if you intend to *edit* the feed.
+
 SERVER = True
+MAX = 70
+TRASH = ['//h1', '//header']
+E = lxml.objectify.E
+
+ITEM_MAP = {
+	'link': (('{http://www.w3.org/2005/Atom}link', 'href'), '{}link'),
+	'desc': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
+	'description': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
+	'summary': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
+	'content': ('{http://www.w3.org/2005/Atom}content', '{http://purl.org/rss/1.0/modules/content/}encoded')
+	}
+RSS_MAP = {
+	'desc': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
+	'description': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
+	'subtitle': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
+	'item': ('{http://www.w3.org/2005/Atom}entry', '{}item'),
+	'entry': ('{http://www.w3.org/2005/Atom}entry', '{}item')
+	}
+
 if SERVER:
 	import httplib
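What ITEM_MAP and RSS_MAP encode is simply that the same logical field lives under different tag names in RSS 2.0 and Atom. A small illustration with lxml.objectify (the two snippets are made-up minimal feed fragments, not part of the commit):

import lxml.objectify

rss_item = lxml.objectify.fromstring(
	"<item><link>http://a.example/1</link><description>short text</description></item>")
atom_entry = lxml.objectify.fromstring(
	"<entry xmlns='http://www.w3.org/2005/Atom'><summary>short text</summary></entry>")

# 'desc' is aliased to <description> on the RSS side and to Atom's <summary>
print rss_item.description
print atom_entry.summary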
@@ -23,215 +50,317 @@ def log(txt):
 	print txt
 	if SERVER:
 		with open('morss.log', 'a') as file:
-			if isinstance(txt, str):
-				file.write(txt.encode('utf-8') + "\n")
+			file.write(str(txt).encode('utf-8') + "\n")
 
-class Info:
-	def __init__(self, item, feed):
-		self.item = item
-		self.feed = feed
-
-		self.data = False
-		self.page = False
-		self.html = False
-		self.con = False
-		self.opener = False
-		self.enc = False
-
-		self.link = self.item.xpath('link')[0]
-		self.desc = self.item.xpath('description')[0]
-
-	def checkURL(self):
-		if self.link.text.startswith("http://rss.feedsportal.com"):
-			log('feedsportal')
-			url = re.search('/([0-9a-zA-Z]+)/[a-zA-Z0-9\.]+$', self.link.text).groups()[0].split('0')
-			t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'L':'ww', 'S':'w.'}
-			self.link.text = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
-			log(self.link.text)
-
-	def fetch(self):
-		log(self.link.text)
-		self.checkURL()
-		if not self.findCache():
-			self.download()
-			self.chardet()
-			self.fetchDesc()
-			self.save()
-		log(self.enc)
-
-	def parseHTML(self):
-		if self.enc is False:
-			self.page = etree.HTML(self.data)
-		else:
-			try:
-				self.page = etree.HTML(self.data.decode(self.enc, 'ignore'))
-			except ValueError:
-				self.page = etree.HTML(self.data)
-
-	def save(self):
-		self.feed.save()
-
-	def findCache(self):
-		if self.feed.cache is not False:
-			xpath = "//link[text()='" + self.link.text + "']/../description/text()"
-			match = self.feed.cache.xpath(xpath)
-			if len(match):
-				log('cached')
-				self.desc.text = match[0]
-				return True
-		return False
-
-	def fetchDesc(self):
-		self.parseHTML()
-		match = self.page.xpath(self.feed.rule)
-		if len(match):
-			self.html = match[0]
-			self.deleteTags()
-			self.desc.text = etree.tostring(self.html).decode(self.enc, 'ignore')
-			log('ok txt')
-		else:
-			log('no match')
-
-	def download(self):
-		try:
-			cj = CookieJar()
-			self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
-			self.con = self.opener.open(self.link.text.encode('utf-8'))
-			self.data = self.con.read()
-		except (urllib2.HTTPError, urllib2.URLError) as error:
-			log(error)
-			log('http error')
-
-	def chardet(self):
-		if self.con.headers.getparam('charset'):
-			log('header')
-			self.enc = self.con.headers.getparam('charset')
-			return
-
-		page = etree.HTML(self.data)
-		header = page.xpath("//head/meta[@http-equiv='Content-Type']/@content")
-		if len(header) and len(header[0].split("=")):
-			log('meta')
-			self.enc = header[0].split("=")[1]
-			return
-
-		header = page.xpath("//head/meta[@charset]/@charset")
-		if len(header):
-			log('meta2')
-			self.enc = header[0]
-			return
-
-		log('chardet')
-		self.enc = chardet.detect(self.data)['encoding']
-
-	def deleteTags(self):
-		for tag in self.feed.trash:
-			for elem in self.html.xpath(tag):
-				elem.getparent().remove(elem)
-
-class Feed:
-	def __init__(self, impl, data, cachePath):
-		self.rulePath = 'rules'
-		self.rule = '//article|//h1/..'
-
-		self.trash = ['//script', '//iframe', '//object', '//noscript', '//form', '//h1']
-		self.max = 70
-
-		self.cachePath = cachePath
-		self.cacheFile = False
-		self.cache = False
-		self.impl = impl
-
-		self.items = []
-		self.rss = False
-		self.out = False
-
-		if self.impl == 'server':
-			self.url = data
-			self.xml = False
-		else:
-			self.url = False
-			self.xml = data
-
-	def save(self):
-		self.out = etree.tostring(self.rss, xml_declaration=True, pretty_print=True)
-		open(self.cacheFile, 'w').write(self.out)
-
-	def getData(self):
-		if self.impl == 'server':
-			req = urllib2.Request(self.url)
-			req.add_unredirected_header('User-Agent', '')
-			self.xml = urllib2.urlopen(req).read()
-			self.cleanXml()
-
-	def setCache(self):
-		if self.cache is not False:
-			return
-
-		self.parse()
-		key = str(hash(self.rss.xpath('//channel/title/text()')[0]))
-		self.cacheFile = self.cachePath + "/" + key
-		log(self.cacheFile)
-		if not os.path.exists(self.cachePath):
-			os.makedirs(self.cachePath)
-
-		if os.path.exists(self.cacheFile):
-			self.cache = etree.XML(open(self.cacheFile, 'r').read())
-
-	def parse(self):
-		if self.rss is not False:
-			return
-
-		self.rss = etree.XML(self.xml)
-
-	def setItems(self):
-		self.items = [Info(e, self) for e in self.rss.xpath('//item')]
-		if self.max:
-			self.items = self.items[:self.max]
-
-	def fill(self):
-		self.parseRules()
-		log(self.rule)
-		for item in self.items:
-			item.fetch()
-
-	def cleanXml(self):
-		table = string.maketrans('', '')
-		self.xml = self.xml.translate(table, table[:32]).lstrip()
-
-	def parseRules(self):
-		if self.impl == 'server':
-			rules = open(self.rulePath, "r").read().split("\n\n")
-			rules = [r.split('\n') for r in rules]
-			for rule in rules:
-				if rule[1] == self.url:
-					self.rule = rule[2]
-					return
-		else:
-			if len(sys.argv) > 1:
-				self.rule = sys.argv[1]
+def cleanXML(xml):
+	table = string.maketrans('', '')
+	return xml.translate(table, table[:32]).lstrip()
+
+class Cache:
+	"""Light, error-prone caching system."""
+	def __init__(self, folder, key):
+		self._key = key
+		self._dir = folder
+		self._file = self._dir + "/" + str(hash(self._key))
+		self._cached = {} # what *was* cached
+		self._cache = {} # new things to put in cache
+
+		if os.path.exists(self._file):
+			data = open(self._file).read().strip().split("\n")
+			for line in data:
+				key, bdata = line.split("\t")
+				self._cached[key] = bdata
+
+		log(str(hash(self._key)))
+
+	def get(self, key):
+		if key in self._cached:
+			return b64decode(self._cached[key])
+		else:
+			return None
+
+	def save(self, key, content):
+		# Maybe, appending to file when adding new elements could be
+		# a good idea, but that'd require to check a couple of things,
+		# like whether it has aleardy been over-written (ie. whether
+		# it no longer contains self._cached)
+
+		self._cache[key] = b64encode(content)
+
+		txt = ""
+		for (key, bdata) in self._cache.iteritems():
+			txt += "\n" + str(key) + "\t" + bdata
+		txt.strip()
+
+		if not os.path.exists(self._dir):
+			os.makedirs(self._dir)
+
+		open(self._file, 'w').write(txt)
+
+class XMLMap(object):
+	"""
+	Sort of wrapper around lxml.objectify.StringElement (from which this
+	class *DOESN'T* inherit) which makes "links" between different children
+	of an element. For example, this allows cheap, efficient, transparent
+	RSS 2.0/Atom seamless use, which can be way faster than feedparser, and
+	has the advantage to edit the corresponding mapped fields. On top of
+	that, XML output with "classic" lxml API calls (such as
+	lxml.etree.tostring) is still possible. Element attributes are also
+	supported (as in <entry attr='value'/>).
+
+	However, keep in mind that this feature's support is only partial. For
+	example if you want to alias an element to both <el>value</el> and <el
+	href='value'/>, and put them as ('el', ('el', 'value')) in the _map
+	definition, then only 'el' will be whatched, even if ('el', 'value')
+	makes more sens in that specific case, because that would require to
+	also check the others, in case of "better" match, which is not done now.
+
+	Also, this class assumes there's some consistency in the _map
+	definition. Which means that it expects matches to be always found in
+	the same "column" in _map. This is useful when setting values which are
+	not yet in the XML tree. Indeed the class will try to use the alias from
+	the same column. With the RSS/Atom example, the default _map will always
+	create elements for the same kind of feed.
+	"""
+	def __init__(self, obj, alias=ITEM_MAP, string=False):
+		self._xml = obj
+		self._key = None
+		self._map = alias
+		self._str = string
+
+		self._guessKey()
+		self._E = E #lxml.objectify.ElementMaker(annotate=False)
+
+	def _guessKey(self):
+		for tag in self._map:
+			self._key = 0
+			for choice in self._map[tag]:
+				if not isinstance(choice, tuple):
+					choice = (choice, None)
+				el, attr = choice
+				if hasattr(self._xml, el):
+					if attr is None:
+						return
+					else:
+						if attr in self._xml[el].attrib:
+							return
+				self._key+=1
+		self._key = 0
+
+	def _getElement(self, tag):
+		"""Returns a tuple whatsoever."""
+		if tag in self._map:
+			for choice in self._map[tag]:
+				if not isinstance(choice, tuple):
+					choice = (choice, None)
+				el, attr = choice
+				if hasattr(self._xml, el):
+					if attr is None:
+						return (self._xml[el], attr)
+					else:
+						if attr in self._xml[el].attrib:
+							return (self._xml[el], attr)
+			return (None, None)
+		if hasattr(self._xml, tag):
+			return (self._xml[tag], None)
+		return (None, None)
+
+	def __getattr__(self, tag):
+		el, attr = self._getElement(tag)
+		if el is not None:
+			if attr is None:
+				out = el
+			else:
+				out = el.get(attr)
+		else:
+			out = self._xml.__getattr__(tag)
+
+		return unicode(out) if self._str else out
+
+	def __getitem__(self, tag):
+		return self.__getattr__(tag)
+
+	def __setattr__(self, tag, value):
+		if tag.startswith('_'):
+			return object.__setattr__(self, tag, value)
+
+		el, attr = self._getElement(tag)
+		if el is not None:
+			if attr is None:
+				if (isinstance(value, lxml.objectify.StringElement)
+					or isinstance(value, str)
+					or isinstance(value, unicode)):
+					el._setText(value)
+				else:
+					el = value
+				return
+			else:
+				el.set(attr, value)
+				return
+		choice = self._map[tag][self._key]
+		if not isinstance(choice, tuple):
+			child = lxml.objectify.Element(choice)
+			self._xml.append(child)
+			self._xml[choice] = value
+			return
+		else:
+			el, attr = choice
+			child = lxml.objectify.Element(choice, attrib={attr:value})
+			self._xml.append(child)
+			return
+
+	def __contains__(self, tag):
+		el, attr = self._getElement(tag)
+		return el is not None
+
+	def remove(self):
+		self._xml.getparent().remove(self._xml)
+
+	def tostring(self, **k):
+		"""Returns string using lxml. Arguments passed to tostring."""
+		out = self._xml if self._xml.getparent() is None else self._xml.getparent()
+		return lxml.etree.tostring(out, pretty_print=True, **k)
+
+def EncDownload(url):
+	try:
+		cj = CookieJar()
+		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
+		con = opener.open(url)
+		data = con.read()
+	except (urllib2.HTTPError, urllib2.URLError) as error:
+		log(error)
+		log('http error')
+		return False
+
+	if con.headers.getparam('charset'):
+		log('header')
+		enc = con.headers.getparam('charset')
+	else:
+		match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data).groups()
+		if len(match):
+			log('meta.re')
+			enc = match[0]
+		else:
+			log('chardet')
+			enc = chardet.detect(data)['encoding']
+
+	return (data, enc)
+
+def parseRules(rulePath, url):
+	rules = open(rulePath, "r").read().strip().split("\n\n")
+	rules = [r.split('\n') for r in rules]
+	for rule in rules:
+		for domain in rule[1:-1]:
+			if fnmatch(url, domain):
+				return rule[-1]
+	return '//article|//h1/..'
+
+def Fill(rss, rule, cache):
+	item = XMLMap(rss, ITEM_MAP, True)
+	log(item.link)
+
+	# content already provided?
+	if 'content' in item:
+		if len(item.content) > 4*len(item.desc):
+			return item
+
+	# check link
+	if fnmatch(item.link, "http://*.feedsportal.com/*"):
+		url = re.search('/([0-9a-zA-Z]+)/[^/]+$', item.link).groups()[0].split('0')
+		t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'L':'ww', 'S':'w.', 'O':'.co.uk'}
+		item.link = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
+	if '{http://rssnamespace.org/feedburner/ext/1.0}origLink' in item:
+		item.link = item['{http://rssnamespace.org/feedburner/ext/1.0}origLink']
+
+	# check cache
+	cached = cache.get(item.link)
+	if cached is not None:
+		log('cached')
+		item.content = cached
+		return item
+
+	# download
+	ddl = EncDownload(item.link)
+
+	if ddl is False:
+		return item
+
+	data, enc = ddl
+	log(enc)
+
+	# parse
+	parser = lxml.html.HTMLParser(encoding=enc)
+	page = lxml.etree.fromstring(data, parser)
+
+	# filter
+	match = page.xpath(rule)
+	if len(match):
+		art = match[0]
+		log('ok txt')
+	else:
+		log('no match')
+		return item
+
+	# clean
+	for tag in TRASH:
+		for elem in art.xpath(tag):
+			elem.getparent().remove(elem)
+
+	art.tag = 'div' # solves crash in lxml.html.clean
+	art = lxml.html.clean.clean_html(art)
+	out = lxml.etree.tostring(art, pretty_print=True).decode(enc, 'ignore')
+	item.content = out
+	cache.save(item.link, out)
+
+def Gather(data, cachePath):
+	# fetch feed
+	if data.startswith("http"):
+		req = urllib2.Request(data)
+		req.add_unredirected_header('User-Agent', '')
+		xml = urllib2.urlopen(req).read()
+	else:
+		xml = data
+
+	xml = cleanXML(xml)
+	rss = lxml.objectify.fromstring(xml)
+	root = rss.channel if hasattr(rss, 'channel') else rss
+	root = XMLMap(root, RSS_MAP)
+
+	cache = Cache(cachePath, unicode(root.title))
+
+	# rules
+	if data.startswith("http"):
+		rule = parseRules('rules', url)
+	else:
+		if len(sys.argv) > 1:
+			rule = sys.argv[1]
+		else:
+			rule = '//article|//h1/..'
+
+	# set
+	log(rule)
+	if MAX:
+		for item in root.item[MAX:]:
+			item.getparent().remove(item)
+	for item in root.item:
+		Fill(item, rule, cache)
+
+	return root.tostring(xml_declaration=True, encoding='UTF-8')
 
 if __name__ == "__main__":
 	if SERVER:
 		print 'Content-Type: text/html\n'
 		url = os.environ['REQUEST_URI'][len(os.environ['SCRIPT_NAME'])+1:]
 		url = 'http://' + url.replace(' ', '%20')
+		cache = os.getcwd() + '/cache'
 		log(url)
-		RSS = Feed('server', url, os.getcwd() + '/cache')
+		RSS = Gather(url, cache)
 	else:
 		xml = sys.stdin.read()
-		cache = expanduser('~') + '/.cache/morss'
-		RSS = Feed('liferea', xml, os.getcwd() + '/cache')
+		cache = os.path.expanduser('~') + '/.cache/morss'
+		RSS = Gather(xml, cache)
 
-	RSS.getData()
-	RSS.parse()
-	RSS.setCache()
-	RSS.setItems()
-	RSS.fill()
-	RSS.save()
-
 	if SERVER or not os.getenv('DEBUG', False):
-		print RSS.out
-	else:
-		print 'done'
+		print RSS
+	log('done')
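For reference, the cache written by the new Cache class is just one base64-encoded entry per line, keyed by the article link and stored in a file named after hash(feed title). A stdlib-only sketch of that layout (the key and payload below are illustrative values, not real cache contents):

from base64 import b64encode, b64decode

link = 'http://example.com/article'       # illustrative cache key
content = '<div>full article text</div>'  # illustrative cached payload

line = link + "\t" + b64encode(content)   # one "<key>\t<base64(content)>" line per entry
print line
print b64decode(line.split("\t")[1])      # round-trips back to the HTML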
30 rules
@@ -1,15 +1,37 @@
 TehranTimes
-http://www.tehrantimes.com/component/ninjarsssyndicator/?feed_id=1&format=raw
+http://www.tehrantimes.com/*
+http://tehrantimes.com/*
 //div[@class='article-indent']
 
 FranceInfo
-http://www.franceinfo.fr/rss.xml
+http://www.franceinfo.fr/rss*
 //h2[@class='chapo']/..
 
+Les Echos
+http://rss.feedsportal.com/c/499/f/413829/index.rss
+http://syndication.lesechos.fr/rss/*
+//h1/../..
+
 Spiegel
-http://www.spiegel.de/schlagzeilen/tops/index.rss
+http://www.spiegel.de/schlagzeilen/*
 //div[@id='spArticleSection']
 
 Le Soir
-http://www.lesoir.be/feed/La%20Une/destination_une_block/
+http://www.lesoir.be/feed/*
 //div[@class='article-content']
+
+Stack Overflow
+http://stackoverflow.com/feeds/*
+//*[@id='question']
+
+Daily Telegraph
+http://www.telegraph.co.uk/*
+//*[@id='mainBodyArea']
+
+Cracked.com
+http://feeds.feedburner.com/CrackedRSS
+//div[@class='content']|//section[@class='body']
+
+TheOnion
+http://feeds.theonion.com/*
+//article
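The rules file above is consumed the way the new parseRules() does it: blocks separated by a blank line, first line a display name, last line the xpath rule, and everything in between a glob pattern for feed URLs. A self-contained sketch of that lookup (rule data copied from the file above; the queried URL is only an example):

from fnmatch import fnmatch

RULES = """Spiegel
http://www.spiegel.de/schlagzeilen/*
//div[@id='spArticleSection']

TheOnion
http://feeds.theonion.com/*
//article"""

def find_rule(url):
	# blocks are separated by blank lines; lines[0] is the name,
	# lines[-1] the xpath rule, lines[1:-1] the URL glob patterns
	for block in RULES.strip().split("\n\n"):
		lines = block.split("\n")
		for pattern in lines[1:-1]:
			if fnmatch(url, pattern):
				return lines[-1]
	return "//article|//h1/.."  # same fallback as morss.py

print find_rule('http://feeds.theonion.com/theonion/articles')  # -> //article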