Change morss.py to use feeds.py
No other changes should appear in this commit
This commit is contained in:
		
							
								
								
									
										170
									
								
								morss.py
									
									
									
									
									
								
							
							
						
						
									
										170
									
								
								morss.py
									
									
									
									
									
								
							@@ -8,12 +8,12 @@ from base64 import b64encode, b64decode
 | 
				
			|||||||
import re
 | 
					import re
 | 
				
			||||||
import string
 | 
					import string
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import lxml.etree
 | 
					 | 
				
			||||||
import lxml.objectify
 | 
					 | 
				
			||||||
import lxml.html
 | 
					import lxml.html
 | 
				
			||||||
import lxml.html.clean
 | 
					import lxml.html.clean
 | 
				
			||||||
import lxml.builder
 | 
					import lxml.builder
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import feeds
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import urllib2
 | 
					import urllib2
 | 
				
			||||||
import socket
 | 
					import socket
 | 
				
			||||||
from cookielib import CookieJar
 | 
					from cookielib import CookieJar
 | 
				
			||||||
@@ -163,132 +163,6 @@ class Cache:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
		return time.time() - os.path.getmtime(self._file) < sec
 | 
							return time.time() - os.path.getmtime(self._file) < sec
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class XMLMap(object):
 | 
					 | 
				
			||||||
	"""
 | 
					 | 
				
			||||||
	Sort of wrapper around lxml.objectify.StringElement (from which this
 | 
					 | 
				
			||||||
	class *DOESN'T* inherit) which makes "links" between different children
 | 
					 | 
				
			||||||
	of an element. For example, this allows cheap, efficient, transparent
 | 
					 | 
				
			||||||
	RSS 2.0/Atom seamless use, which can be way faster than feedparser, and
 | 
					 | 
				
			||||||
	has the advantage to edit the corresponding mapped fields. On top of
 | 
					 | 
				
			||||||
	that, XML output with "classic" lxml API calls (such as
 | 
					 | 
				
			||||||
	lxml.etree.tostring) is still possible. Element attributes are also
 | 
					 | 
				
			||||||
	supported (as in <entry attr='value'/>).
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	However, keep in mind that this feature's support is only partial. For
 | 
					 | 
				
			||||||
	example if you want to alias an element to both <el>value</el> and <el
 | 
					 | 
				
			||||||
	href='value'/>, and put them as ('el', ('el', 'value')) in the _map
 | 
					 | 
				
			||||||
	definition, then only 'el' will be whatched, even if ('el', 'value')
 | 
					 | 
				
			||||||
	makes more sens in that specific case, because that would require to
 | 
					 | 
				
			||||||
	also check the others, in case of "better" match, which is not done now.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	Also, this class assumes there's some consistency in the _map
 | 
					 | 
				
			||||||
	definition. Which means that it expects matches to be always found in
 | 
					 | 
				
			||||||
	the same "column" in _map. This is useful when setting values which are
 | 
					 | 
				
			||||||
	not yet in the XML tree. Indeed the class will try to use the alias from
 | 
					 | 
				
			||||||
	the same column. With the RSS/Atom example, the default _map will always
 | 
					 | 
				
			||||||
	create elements for the same kind of feed.
 | 
					 | 
				
			||||||
	"""
 | 
					 | 
				
			||||||
	def __init__(self, obj, alias=ITEM_MAP, string=False):
 | 
					 | 
				
			||||||
		self._xml = obj
 | 
					 | 
				
			||||||
		self._key = None
 | 
					 | 
				
			||||||
		self._map = alias
 | 
					 | 
				
			||||||
		self._str = string
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		self._guessKey()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def _guessKey(self):
 | 
					 | 
				
			||||||
		for tag in self._map:
 | 
					 | 
				
			||||||
			self._key = 0
 | 
					 | 
				
			||||||
			for choice in self._map[tag]:
 | 
					 | 
				
			||||||
				if not isinstance(choice, tuple):
 | 
					 | 
				
			||||||
					choice = (choice, None)
 | 
					 | 
				
			||||||
				el, attr = choice
 | 
					 | 
				
			||||||
				if hasattr(self._xml, el):
 | 
					 | 
				
			||||||
					if attr is None:
 | 
					 | 
				
			||||||
						return
 | 
					 | 
				
			||||||
					else:
 | 
					 | 
				
			||||||
						if attr in self._xml[el].attrib:
 | 
					 | 
				
			||||||
							return
 | 
					 | 
				
			||||||
				self._key+=1
 | 
					 | 
				
			||||||
		self._key = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def _getElement(self, tag):
 | 
					 | 
				
			||||||
		"""Returns a tuple whatsoever."""
 | 
					 | 
				
			||||||
		if tag in self._map:
 | 
					 | 
				
			||||||
			for choice in self._map[tag]:
 | 
					 | 
				
			||||||
				if not isinstance(choice, tuple):
 | 
					 | 
				
			||||||
					choice = (choice, None)
 | 
					 | 
				
			||||||
				el, attr = choice
 | 
					 | 
				
			||||||
				if hasattr(self._xml, el):
 | 
					 | 
				
			||||||
					if attr is None:
 | 
					 | 
				
			||||||
						return (self._xml[el], attr)
 | 
					 | 
				
			||||||
					else:
 | 
					 | 
				
			||||||
						if attr in self._xml[el].attrib:
 | 
					 | 
				
			||||||
							return (self._xml[el], attr)
 | 
					 | 
				
			||||||
			return (None, None)
 | 
					 | 
				
			||||||
		if hasattr(self._xml, tag):
 | 
					 | 
				
			||||||
			return (self._xml[tag], None)
 | 
					 | 
				
			||||||
		return (None, None)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def __getattr__(self, tag):
 | 
					 | 
				
			||||||
		el, attr = self._getElement(tag)
 | 
					 | 
				
			||||||
		if el is not None:
 | 
					 | 
				
			||||||
			if attr is None:
 | 
					 | 
				
			||||||
				out = el
 | 
					 | 
				
			||||||
			else:
 | 
					 | 
				
			||||||
				out = el.get(attr)
 | 
					 | 
				
			||||||
		else:
 | 
					 | 
				
			||||||
			out = self._xml.__getattr__(tag)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		return unicode(out) if self._str else out
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def __getitem__(self, tag):
 | 
					 | 
				
			||||||
		if self.__contains__(tag):
 | 
					 | 
				
			||||||
			return self.__getattr__(tag)
 | 
					 | 
				
			||||||
		else:
 | 
					 | 
				
			||||||
			return None
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def __setattr__(self, tag, value):
 | 
					 | 
				
			||||||
		if tag.startswith('_'):
 | 
					 | 
				
			||||||
			return object.__setattr__(self, tag, value)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		el, attr = self._getElement(tag)
 | 
					 | 
				
			||||||
		if el is not None:
 | 
					 | 
				
			||||||
			if attr is None:
 | 
					 | 
				
			||||||
				if (isinstance(value, lxml.objectify.StringElement)
 | 
					 | 
				
			||||||
					or isinstance(value, str)
 | 
					 | 
				
			||||||
					or isinstance(value, unicode)):
 | 
					 | 
				
			||||||
					el._setText(value)
 | 
					 | 
				
			||||||
				else:
 | 
					 | 
				
			||||||
					el = value
 | 
					 | 
				
			||||||
				return
 | 
					 | 
				
			||||||
			else:
 | 
					 | 
				
			||||||
				el.set(attr, value)
 | 
					 | 
				
			||||||
				return
 | 
					 | 
				
			||||||
		choice = self._map[tag][self._key]
 | 
					 | 
				
			||||||
		if not isinstance(choice, tuple):
 | 
					 | 
				
			||||||
			child = lxml.objectify.Element(choice)
 | 
					 | 
				
			||||||
			self._xml.append(child)
 | 
					 | 
				
			||||||
			self._xml[choice] = value
 | 
					 | 
				
			||||||
			return
 | 
					 | 
				
			||||||
		else:
 | 
					 | 
				
			||||||
			el, attr = choice
 | 
					 | 
				
			||||||
			child = lxml.objectify.Element(choice, attrib={attr:value})
 | 
					 | 
				
			||||||
			self._xml.append(child)
 | 
					 | 
				
			||||||
			return
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def __contains__(self, tag):
 | 
					 | 
				
			||||||
		el, attr = self._getElement(tag)
 | 
					 | 
				
			||||||
		return el is not None
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def remove(self):
 | 
					 | 
				
			||||||
		self._xml.getparent().remove(self._xml)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def tostring(self, **k):
 | 
					 | 
				
			||||||
		"""Returns string using lxml. Arguments passed to tostring."""
 | 
					 | 
				
			||||||
		out = self._xml if self._xml.getparent() is None else self._xml.getparent()
 | 
					 | 
				
			||||||
		return lxml.etree.tostring(out, pretty_print=True, **k)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def EncDownload(url):
 | 
					def EncDownload(url):
 | 
				
			||||||
	try:
 | 
						try:
 | 
				
			||||||
		cj = CookieJar()
 | 
							cj = CookieJar()
 | 
				
			||||||
@@ -323,19 +197,20 @@ def EncDownload(url):
 | 
				
			|||||||
	log(enc)
 | 
						log(enc)
 | 
				
			||||||
	return (data.decode(enc, 'replace'), con.geturl())
 | 
						return (data.decode(enc, 'replace'), con.geturl())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def Fill(rss, cache, feedurl="/", fast=False):
 | 
					def Fill(item, cache, feedurl="/", fast=False):
 | 
				
			||||||
	""" Returns True when it has done its best """
 | 
						""" Returns True when it has done its best """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	item = XMLMap(rss, ITEM_MAP, True)
 | 
						if not item.link:
 | 
				
			||||||
	log(item.link)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	if 'link' not in item:
 | 
					 | 
				
			||||||
		log('no link')
 | 
							log('no link')
 | 
				
			||||||
		return True
 | 
							return True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						log(item.link)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	# feedburner
 | 
						# feedburner
 | 
				
			||||||
	if '{http://rssnamespace.org/feedburner/ext/1.0}origLink' in item:
 | 
						feeds.NSMAP['feedburner'] = 'http://rssnamespace.org/feedburner/ext/1.0'
 | 
				
			||||||
		item.link = item['{http://rssnamespace.org/feedburner/ext/1.0}origLink']
 | 
						match = item.xval('feedburner:origLink')
 | 
				
			||||||
 | 
						if match:
 | 
				
			||||||
 | 
							item.link = match
 | 
				
			||||||
		log(item.link)
 | 
							log(item.link)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	# feedsportal
 | 
						# feedsportal
 | 
				
			||||||
@@ -358,12 +233,11 @@ def Fill(rss, cache, feedurl="/", fast=False):
 | 
				
			|||||||
		item.link = urlparse.urljoin(feedurl, item.link)
 | 
							item.link = urlparse.urljoin(feedurl, item.link)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	# check unwanted uppercase title
 | 
						# check unwanted uppercase title
 | 
				
			||||||
	if 'title' in item:
 | 
						if len(item.title) > 20 and item.title.isupper():
 | 
				
			||||||
		if len(item.title) > 20 and item.title.isupper():
 | 
							item.title = item.title.title()
 | 
				
			||||||
			item.title = item.title.title()
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	# content already provided?
 | 
						# content already provided?
 | 
				
			||||||
	if 'content' in item and 'desc' in item:
 | 
						if item.content and item.desc:
 | 
				
			||||||
		len_content = lenHTML(item.content)
 | 
							len_content = lenHTML(item.content)
 | 
				
			||||||
		len_desc = lenHTML(item.desc)
 | 
							len_desc = lenHTML(item.desc)
 | 
				
			||||||
		log('content: %s vs %s' % (len_content, len_desc))
 | 
							log('content: %s vs %s' % (len_content, len_desc))
 | 
				
			||||||
@@ -402,7 +276,7 @@ def Fill(rss, cache, feedurl="/", fast=False):
 | 
				
			|||||||
	data, url = ddl
 | 
						data, url = ddl
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	out = readability.Document(data, url=url).summary(True)
 | 
						out = readability.Document(data, url=url).summary(True)
 | 
				
			||||||
	if 'desc' not in item or lenHTML(out) > lenHTML(item.desc):
 | 
						if not item.desc or lenHTML(out) > lenHTML(item.desc):
 | 
				
			||||||
		item.content = out
 | 
							item.content = out
 | 
				
			||||||
		cache.set(item.link, out)
 | 
							cache.set(item.link, out)
 | 
				
			||||||
	else:
 | 
						else:
 | 
				
			||||||
@@ -429,14 +303,12 @@ def Gather(url, cachePath, mode='feed'):
 | 
				
			|||||||
			return False
 | 
								return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	xml = cleanXML(xml)
 | 
						xml = cleanXML(xml)
 | 
				
			||||||
	rss = lxml.objectify.fromstring(xml)
 | 
						rss = feeds.parse(xml)
 | 
				
			||||||
	root = rss.channel if hasattr(rss, 'channel') else rss
 | 
						size = len(rss)
 | 
				
			||||||
	root = XMLMap(root, RSS_MAP)
 | 
					 | 
				
			||||||
	size = len(root.item)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	# set
 | 
						# set
 | 
				
			||||||
	startTime = time.time()
 | 
						startTime = time.time()
 | 
				
			||||||
	for i, item in enumerate(root.item):
 | 
						for i, item in enumerate(rss.items):
 | 
				
			||||||
		if mode == 'progress':
 | 
							if mode == 'progress':
 | 
				
			||||||
			if MAX_ITEM == 0:
 | 
								if MAX_ITEM == 0:
 | 
				
			||||||
				print "%s/%s" % (i+1, size)
 | 
									print "%s/%s" % (i+1, size)
 | 
				
			||||||
@@ -445,16 +317,16 @@ def Gather(url, cachePath, mode='feed'):
 | 
				
			|||||||
			sys.stdout.flush()
 | 
								sys.stdout.flush()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if i+1 > LIM_ITEM > 0:
 | 
							if i+1 > LIM_ITEM > 0:
 | 
				
			||||||
			item.getparent().remove(item)
 | 
								item.remove()
 | 
				
			||||||
		elif time.time() - startTime > MAX_TIME >= 0 or i+1 > MAX_ITEM > 0:
 | 
							elif time.time() - startTime > MAX_TIME >= 0 or i+1 > MAX_ITEM > 0:
 | 
				
			||||||
			if Fill(item, cache, url, True) is False:
 | 
								if Fill(item, cache, url, True) is False:
 | 
				
			||||||
				item.getparent().remove(item)
 | 
									item.remove()
 | 
				
			||||||
		else:
 | 
							else:
 | 
				
			||||||
			Fill(item, cache, url)
 | 
								Fill(item, cache, url)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	log(len(root.item))
 | 
						log(len(rss))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return root.tostring(xml_declaration=True, encoding='UTF-8')
 | 
						return rss.tostring(xml_declaration=True, encoding='UTF-8')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
	url, options = parseOptions(OPTIONS)
 | 
						url, options = parseOptions(OPTIONS)
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user