Another huge commit.
Now uses OOP where it fits. Atom feeds are supported, though no real tests were written yet. Unix globbing is now possible for urls. Caching is done in a cleaner way. Feedburner links are also replaced with their original targets. HTML is cleaned in a more efficient way. The code is now much cleaner, using lxml.objectify and a small wrapper to access Atom feeds as if they were RSS feeds (and much faster than feedparser). README has been updated.
@@ -3,6 +3,8 @@

This tool's goal is to get full-text RSS feeds out of stripped RSS feeds, commonly available on the internet. Indeed, most newspapers only make a short description available in their RSS feeds, which makes those feeds rather useless. This tool intends to fix that problem.

This tool opens the links from the RSS feed, then downloads the full article from the newspaper's website and puts it back in the feed.

morss also has experimental support for Atom feeds.

##(xpath) Rules

To find the article content on the newspaper's website, morss needs to know where to look. The default target is the first `<h1>` element, since that is common practice, or an `<article>` element, for HTML5-compliant websites.
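
In the code, this default target corresponds to the xpath rule `//article|//h1/..`, which is used whenever no rule matches the feed's url.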

@@ -19,9 +21,12 @@ Here, xpath rules stored in the `rules` file. (The name of the file can be chang

	Fancy name (description)(useless but not optional)
	http://example.com/path/to/the/rss/feed.xml
	http://example.co.uk/other/*/path/with/wildcard/*.xml
	//super/accurate[@xpath='expression']/..

Works like a charm with Tiny TinyRSS (<http://tt-rss.org/redmine/projects/tt-rss/wiki>).

As shown in the example, multiple urls can be specified for a single rule, so as to match feeds served from different locations on the website (for example with or without "www."). Moreover, feed urls can be *NIX glob-style patterns, so as to match any feed from a website.
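
For instance, matching a feed url against a rule's glob patterns boils down to Python's `fnmatch` (a minimal, illustrative sketch; the urls come from the example above):

	from fnmatch import fnmatch

	patterns = ['http://example.com/path/to/the/rss/feed.xml',
		'http://example.co.uk/other/*/path/with/wildcard/*.xml']
	url = 'http://example.co.uk/other/news/path/with/wildcard/today.xml'

	print any(fnmatch(url, pattern) for pattern in patterns)  # prints True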

Works like a charm with Tiny Tiny RSS (<http://tt-rss.org/redmine/projects/tt-rss/wiki>).

###As a newsreader hook

@@ -51,4 +56,4 @@ Unwanted HTML elements are also stripped from the article. By default, elements

---

GPL3 licence.
Python **2.6** required (not 3).
Python **2.6**+ required (not 3).

cache/.htaccess (1 line changed, vendored)
@@ -1 +0,0 @@
DefaultType text/html

morss.py (483 lines changed)
@@ -1,15 +1,42 @@
#!/usr/bin/env python
import sys
import os
from os.path import expanduser
from lxml import etree
import copy
from base64 import b64encode, b64decode
from fnmatch import fnmatch
import os.path
import lxml.etree
import lxml.objectify
import lxml.html
import lxml.html.clean
import lxml.builder
import re
import string
import urllib2
from cookielib import CookieJar
import chardet

# DISCLAIMER: feedparser is pure shit if you intend to *edit* the feed.

SERVER = True
MAX = 70
TRASH = ['//h1', '//header']
E = lxml.objectify.E

ITEM_MAP = {
	'link':		(('{http://www.w3.org/2005/Atom}link', 'href'),	'{}link'),
	'desc':		('{http://www.w3.org/2005/Atom}summary',	'{}description'),
	'description':	('{http://www.w3.org/2005/Atom}summary',	'{}description'),
	'summary':	('{http://www.w3.org/2005/Atom}summary',	'{}description'),
	'content':	('{http://www.w3.org/2005/Atom}content',	'{http://purl.org/rss/1.0/modules/content/}encoded')
	}
RSS_MAP = {
	'desc':		('{http://www.w3.org/2005/Atom}subtitle',	'{}description'),
	'description':	('{http://www.w3.org/2005/Atom}subtitle',	'{}description'),
	'subtitle':	('{http://www.w3.org/2005/Atom}subtitle',	'{}description'),
	'item':		('{http://www.w3.org/2005/Atom}entry',		'{}item'),
	'entry':	('{http://www.w3.org/2005/Atom}entry',		'{}item')
	}
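# Each entry aliases one logical field name to its (Atom form, RSS form) pair;
# a (tag, attribute) tuple means the value lives in that attribute, as with
# Atom's <link href="..."/>.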

if SERVER:
	import httplib

@@ -23,215 +50,317 @@ def log(txt):
		print txt
	if SERVER:
		with open('morss.log', 'a') as file:
			if isinstance(txt, str):
				file.write(txt.encode('utf-8') + "\n")
			file.write(str(txt).encode('utf-8') + "\n")

class Info:
	def __init__(self, item, feed):
		self.item = item
		self.feed = feed

def cleanXML(xml):
	table = string.maketrans('', '')
	return xml.translate(table, table[:32]).lstrip()

		self.data = False
		self.page = False
		self.html = False
		self.con = False
		self.opener = False
		self.enc = False

class Cache:
	"""Light, error-prone caching system."""
	def __init__(self, folder, key):
		self._key = key
		self._dir = folder
		self._file = self._dir + "/" + str(hash(self._key))
		self._cached = {} # what *was* cached
		self._cache = {} # new things to put in cache

		self.link = self.item.xpath('link')[0]
		self.desc = self.item.xpath('description')[0]
		if os.path.exists(self._file):
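			# each line of the cache file stores one "<key>\t<base64 value>" pair;
			# base64 keeps cached values free of the tabs and newlines the format relies on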
			data = open(self._file).read().strip().split("\n")
			for line in data:
				key, bdata = line.split("\t")
				self._cached[key] = bdata

	def checkURL(self):
		if self.link.text.startswith("http://rss.feedsportal.com"):
			log('feedsportal')
			url = re.search('/([0-9a-zA-Z]+)/[a-zA-Z0-9\.]+$', self.link.text).groups()[0].split('0')
			t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'L':'ww', 'S':'w.'}
			self.link.text = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
			log(self.link.text)
		log(str(hash(self._key)))

	def fetch(self):
		log(self.link.text)
		self.checkURL()
		if not self.findCache():
			self.download()
			self.chardet()
			self.fetchDesc()
		self.save()
		log(self.enc)

	def parseHTML(self):
		if self.enc is False:
			self.page = etree.HTML(self.data)
	def get(self, key):
		if key in self._cached:
			return b64decode(self._cached[key])
		else:
			try:
				self.page = etree.HTML(self.data.decode(self.enc, 'ignore'))
			except ValueError:
				self.page = etree.HTML(self.data)
			return None

	def save(self, key, content):
		# Maybe appending to the file when adding new elements could be
		# a good idea, but that would require checking a couple of things,
		# like whether it has already been overwritten (i.e. whether
		# it no longer contains self._cached)

	def save(self):
		self.feed.save()
		self._cache[key] = b64encode(content)

	def findCache(self):
		if self.feed.cache is not False:
			xpath = "//link[text()='" + self.link.text + "']/../description/text()"
			match = self.feed.cache.xpath(xpath)
			if len(match):
				log('cached')
				self.desc.text = match[0]
				return True
		txt = ""
		for (key, bdata) in self._cache.iteritems():
			txt += "\n" + str(key) + "\t" + bdata
		txt = txt.strip()

		if not os.path.exists(self._dir):
			os.makedirs(self._dir)

		open(self._file, 'w').write(txt)

class XMLMap(object):
	"""
	Sort of wrapper around lxml.objectify.StringElement (from which this
	class *DOESN'T* inherit) which makes "links" between different children
	of an element. For example, this allows cheap, efficient, transparent
	and seamless use of RSS 2.0/Atom feeds, which can be way faster than
	feedparser, and has the advantage of letting you edit the corresponding
	mapped fields. On top of that, XML output with "classic" lxml API calls
	(such as lxml.etree.tostring) is still possible. Element attributes are
	also supported (as in <entry attr='value'/>).

	However, keep in mind that this feature's support is only partial. For
	example if you want to alias an element to both <el>value</el> and <el
	href='value'/>, and put them as ('el', ('el', 'value')) in the _map
	definition, then only 'el' will be watched, even if ('el', 'value')
	makes more sense in that specific case, because that would require
	also checking the others for a "better" match, which is not done for now.

	Also, this class assumes there's some consistency in the _map
	definition. Which means it expects matches to always be found in
	the same "column" of _map. This is useful when setting values which are
	not yet in the XML tree. Indeed the class will try to use the alias from
	the same column. With the RSS/Atom example, the default _map will always
	create elements for the same kind of feed.
	"""
	def __init__(self, obj, alias=ITEM_MAP, string=False):
		self._xml = obj
		self._key = None
		self._map = alias
		self._str = string

		self._guessKey()
		self._E = E #lxml.objectify.ElementMaker(annotate=False)

	def _guessKey(self):
		for tag in self._map:
			self._key = 0
			for choice in self._map[tag]:
				if not isinstance(choice, tuple):
					choice = (choice, None)
				el, attr = choice
				if hasattr(self._xml, el):
					if attr is None:
						return
					else:
						if attr in self._xml[el].attrib:
							return
				self._key += 1
		self._key = 0

	def _getElement(self, tag):
		"""Always returns an (element, attribute) tuple."""
		if tag in self._map:
			for choice in self._map[tag]:
				if not isinstance(choice, tuple):
					choice = (choice, None)
				el, attr = choice
				if hasattr(self._xml, el):
					if attr is None:
						return (self._xml[el], attr)
					else:
						if attr in self._xml[el].attrib:
							return (self._xml[el], attr)
			return (None, None)
		if hasattr(self._xml, tag):
			return (self._xml[tag], None)
		return (None, None)

	def __getattr__(self, tag):
		el, attr = self._getElement(tag)
		if el is not None:
			if attr is None:
				out = el
			else:
				out = el.get(attr)
		else:
			out = self._xml.__getattr__(tag)

		return unicode(out) if self._str else out

	def __getitem__(self, tag):
		return self.__getattr__(tag)

	def __setattr__(self, tag, value):
		if tag.startswith('_'):
			return object.__setattr__(self, tag, value)

		el, attr = self._getElement(tag)
		if el is not None:
			if attr is None:
				if (isinstance(value, lxml.objectify.StringElement)
					or isinstance(value, str)
					or isinstance(value, unicode)):
					el._setText(value)
				else:
					el = value
				return
			else:
				el.set(attr, value)
				return
		choice = self._map[tag][self._key]
		if not isinstance(choice, tuple):
			child = lxml.objectify.Element(choice)
			self._xml.append(child)
			self._xml[choice] = value
			return
		else:
			el, attr = choice
			child = lxml.objectify.Element(el, attrib={attr:value})
			self._xml.append(child)
			return

	def __contains__(self, tag):
		el, attr = self._getElement(tag)
		return el is not None

	def remove(self):
		self._xml.getparent().remove(self._xml)

	def tostring(self, **k):
		"""Returns a string using lxml. Arguments are passed through to lxml.etree.tostring."""
		out = self._xml if self._xml.getparent() is None else self._xml.getparent()
		return lxml.etree.tostring(out, pretty_print=True, **k)
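
# Illustrative usage sketch (this mirrors what Fill() does below): wrap an
# objectified <item>/<entry> and read/write fields under their RSS names,
# whatever the underlying feed format:
#	item = XMLMap(entry, ITEM_MAP, True)
#	log(item.link)			# Atom <link href="..."/> or RSS <link>
#	item.content = '<p>full text</p>'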

def EncDownload(url):
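	# downloads the page and guesses its encoding: the HTTP 'charset' header
	# first, then the charset declared in the page, then chardet as a last resort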
	try:
		cj = CookieJar()
		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
		con = opener.open(url)
		data = con.read()
	except (urllib2.HTTPError, urllib2.URLError) as error:
		log(error)
		log('http error')
		return False

	def fetchDesc(self):
		self.parseHTML()
		match = self.page.xpath(self.feed.rule)
	if con.headers.getparam('charset'):
		log('header')
		enc = con.headers.getparam('charset')
	else:
		match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data).groups()
		if len(match):
			self.html = match[0]
			self.deleteTags()
			self.desc.text = etree.tostring(self.html).decode(self.enc, 'ignore')
			log('ok txt')
			log('meta.re')
			enc = match[0]
		else:
			log('no match')
			log('chardet')
			enc = chardet.detect(data)['encoding']

	def download(self):
		try:
			cj = CookieJar()
			self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
			self.con = self.opener.open(self.link.text.encode('utf-8'))
			self.data = self.con.read()
		except (urllib2.HTTPError, urllib2.URLError) as error:
			log(error)
			log('http error')
	return (data, enc)

	def chardet(self):
		if self.con.headers.getparam('charset'):
			log('header')
			self.enc = self.con.headers.getparam('charset')
			return

def parseRules(rulePath, url):
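	# the rules file holds blank-line-separated blocks: a display name, then
	# one or more url glob patterns, then the xpath rule on the last line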
	rules = open(rulePath, "r").read().strip().split("\n\n")
	rules = [r.split('\n') for r in rules]
	for rule in rules:
		for domain in rule[1:-1]:
			if fnmatch(url, domain):
				return rule[-1]
	return '//article|//h1/..'

		page = etree.HTML(self.data)
		header = page.xpath("//head/meta[@http-equiv='Content-Type']/@content")
		if len(header) and len(header[0].split("=")):
			log('meta')
			self.enc = header[0].split("=")[1]
			return

def Fill(rss, rule, cache):
	item = XMLMap(rss, ITEM_MAP, True)
	log(item.link)

		header = page.xpath("//head/meta[@charset]/@charset")
		if len(header):
			log('meta2')
			self.enc = header[0]
			return

	# content already provided?
	if 'content' in item:
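		# trust the feed's own content only when it clearly dwarfs the summary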
		if len(item.content) > 4*len(item.desc):
			return item

		log('chardet')
		self.enc = chardet.detect(self.data)['encoding']

	# check link
	if fnmatch(item.link, "http://*.feedsportal.com/*"):
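		# feedsportal hides the real url in the last path segment: chunks are
		# separated by '0' and each chunk's first letter decodes via the table below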
		url = re.search('/([0-9a-zA-Z]+)/[^/]+$', item.link).groups()[0].split('0')
		t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'L':'ww', 'S':'w.', 'O':'.co.uk'}
		item.link = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
	if '{http://rssnamespace.org/feedburner/ext/1.0}origLink' in item:
		item.link = item['{http://rssnamespace.org/feedburner/ext/1.0}origLink']

	def deleteTags(self):
		for tag in self.feed.trash:
			for elem in self.html.xpath(tag):
				elem.getparent().remove(elem)

	# check cache
	cached = cache.get(item.link)
	if cached is not None:
		log('cached')
		item.content = cached
		return item

class Feed:
	def __init__(self, impl, data, cachePath):
		self.rulePath = 'rules'
		self.rule = '//article|//h1/..'

	# download
	ddl = EncDownload(item.link)

		self.trash = ['//script', '//iframe', '//object', '//noscript', '//form', '//h1']
		self.max = 70
	if ddl is False:
		return item

		self.cachePath = cachePath
		self.cacheFile = False
		self.cache = False
		self.impl = impl
	data, enc = ddl
	log(enc)

		self.items = []
		self.rss = False
		self.out = False
	# parse
	parser = lxml.html.HTMLParser(encoding=enc)
	page = lxml.etree.fromstring(data, parser)

		if self.impl == 'server':
			self.url = data
			self.xml = False

	# filter
	match = page.xpath(rule)
	if len(match):
		art = match[0]
		log('ok txt')
	else:
		log('no match')
		return item

	# clean
	for tag in TRASH:
		for elem in art.xpath(tag):
			elem.getparent().remove(elem)

	art.tag = 'div' # solves crash in lxml.html.clean
	art = lxml.html.clean.clean_html(art)
	out = lxml.etree.tostring(art, pretty_print=True).decode(enc, 'ignore')
	item.content = out
	cache.save(item.link, out)

def Gather(data, cachePath):
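	# fetches the feed, normalises it through XMLMap, then fills every item
	# with the full article (served from cache when possible)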
	# fetch feed
	if data.startswith("http"):
		req = urllib2.Request(data)
		req.add_unredirected_header('User-Agent', '')
		xml = urllib2.urlopen(req).read()
	else:
		xml = data

	xml = cleanXML(xml)
	rss = lxml.objectify.fromstring(xml)
	root = rss.channel if hasattr(rss, 'channel') else rss
	root = XMLMap(root, RSS_MAP)

	cache = Cache(cachePath, unicode(root.title))

	# rules
	if data.startswith("http"):
		rule = parseRules('rules', data)
	else:
		if len(sys.argv) > 1:
			rule = sys.argv[1]
		else:
			self.url = False
			self.xml = data
			rule = '//article|//h1/..'

	def save(self):
		self.out = etree.tostring(self.rss, xml_declaration=True, pretty_print=True)
		open(self.cacheFile, 'w').write(self.out)

	# set
	log(rule)
	if MAX:
		for item in root.item[MAX:]:
			item.getparent().remove(item)
	for item in root.item:
		Fill(item, rule, cache)

	def getData(self):
		if self.impl == 'server':
			req = urllib2.Request(self.url)
			req.add_unredirected_header('User-Agent', '')
			self.xml = urllib2.urlopen(req).read()
		self.cleanXml()

	def setCache(self):
		if self.cache is not False:
			return

		self.parse()
		key = str(hash(self.rss.xpath('//channel/title/text()')[0]))
		self.cacheFile = self.cachePath + "/" + key
		log(self.cacheFile)
		if not os.path.exists(self.cachePath):
			os.makedirs(self.cachePath)

		if os.path.exists(self.cacheFile):
			self.cache = etree.XML(open(self.cacheFile, 'r').read())

	def parse(self):
		if self.rss is not False:
			return

		self.rss = etree.XML(self.xml)

	def setItems(self):
		self.items = [Info(e, self) for e in self.rss.xpath('//item')]
		if self.max:
			self.items = self.items[:self.max]

	def fill(self):
		self.parseRules()
		log(self.rule)
		for item in self.items:
			item.fetch()

	def cleanXml(self):
		table = string.maketrans('', '')
		self.xml = self.xml.translate(table, table[:32]).lstrip()

	def parseRules(self):
		if self.impl == 'server':
			rules = open(self.rulePath, "r").read().split("\n\n")
			rules = [r.split('\n') for r in rules]
			for rule in rules:
				if rule[1] == self.url:
					self.rule = rule[2]
					return
		else:
			if len(sys.argv) > 1:
				self.rule = sys.argv[1]
	return root.tostring(xml_declaration=True, encoding='UTF-8')

if __name__ == "__main__":
	if SERVER:
		print 'Content-Type: text/html\n'
		url = os.environ['REQUEST_URI'][len(os.environ['SCRIPT_NAME'])+1:]
		url = 'http://' + url.replace(' ', '%20')
		cache = os.getcwd() + '/cache'
		log(url)
		RSS = Feed('server', url, os.getcwd() + '/cache')
		RSS = Gather(url, cache)
	else:
		xml = sys.stdin.read()
		cache = expanduser('~') + '/.cache/morss'
		RSS = Feed('liferea', xml, os.getcwd() + '/cache')

	RSS.getData()
	RSS.parse()
	RSS.setCache()
	RSS.setItems()
	RSS.fill()
	RSS.save()
		xml = sys.stdin.read()
		cache = os.path.expanduser('~') + '/.cache/morss'
		RSS = Gather(xml, cache)

	if SERVER or not os.getenv('DEBUG', False):
		print RSS.out
	else:
		print 'done'
		print RSS

	log('done')

rules (30 lines changed)
@@ -1,15 +1,37 @@
TehranTimes
http://www.tehrantimes.com/component/ninjarsssyndicator/?feed_id=1&format=raw
http://www.tehrantimes.com/*
http://tehrantimes.com/*
//div[@class='article-indent']

FranceInfo
http://www.franceinfo.fr/rss.xml
http://www.franceinfo.fr/rss*
//h2[@class='chapo']/..

Les Echos
http://rss.feedsportal.com/c/499/f/413829/index.rss
http://syndication.lesechos.fr/rss/*
//h1/../..

Spiegel
http://www.spiegel.de/schlagzeilen/tops/index.rss
http://www.spiegel.de/schlagzeilen/*
//div[@id='spArticleSection']

Le Soir
http://www.lesoir.be/feed/La%20Une/destination_une_block/
http://www.lesoir.be/feed/*
//div[@class='article-content']

Stack Overflow
http://stackoverflow.com/feeds/*
//*[@id='question']

Daily Telegraph
http://www.telegraph.co.uk/*
//*[@id='mainBodyArea']

Cracked.com
http://feeds.feedburner.com/CrackedRSS
//div[@class='content']|//section[@class='body']

TheOnion
http://feeds.theonion.com/*
//article