Another huge commit.

Now uses OOP where it fits. Atom feeds are supported, but no real tests have been done. Unix globbing is now possible for feed urls. Caching is done in a cleaner way. Feedburner links are also replaced with the original links. HTML is cleaned in a more efficient way. The code is now much cleaner, using lxml.objectify and a small wrapper to access Atom feeds as if they were RSS feeds (much faster than feedparser). The README has been updated.
master
pictuga 2013-04-15 18:51:55 +02:00
parent a098b7e104
commit af8879049f
4 changed files with 339 additions and 184 deletions
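The lxml.objectify wrapper mentioned in the commit message exists because RSS 2.0 and Atom carry the same information under different element names, sometimes in attributes rather than element text. A minimal sketch of that gap, not taken from this commit, with made-up sample feeds and written for the same Python 2 environment as morss.py; the XMLMap/ITEM_MAP code added below is what bridges it:

    import lxml.objectify

    RSS = """<rss version="2.0"><channel><title>demo</title>
    <item><title>hello</title><link>http://example.com/a</link>
    <description>short text</description></item></channel></rss>"""

    ATOM = """<feed xmlns="http://www.w3.org/2005/Atom"><title>demo</title>
    <entry><title>hello</title><link href="http://example.com/a"/>
    <summary>short text</summary></entry></feed>"""

    rss_item = lxml.objectify.fromstring(RSS).channel.item
    atom_entry = lxml.objectify.fromstring(ATOM).entry

    # same data, different spellings: element text vs. attribute, description vs. summary
    print rss_item.link                 # http://example.com/a
    print atom_entry.link.get('href')   # http://example.com/a
    print rss_item.description          # short text
    print atom_entry.summary            # short text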

README.md

@@ -3,6 +3,8 @@
This tool's goal is to get full-text RSS feeds out of stripped RSS feeds, commonly available on the internet. Indeed, most newspapers only make a short description available in their RSS feeds, which makes the feed rather useless. This tool intends to fix that problem.
This tool opens the links from the RSS feed, then downloads the full article from the newspaper's website and puts it back in the feed.

morss also has experimental support for Atom feeds.

## (xpath) Rules
To find the article content on the newspaper's website, morss needs to know where to look. The default target is the first `<h1>` element, since that is common practice, or an `<article>` element for HTML5-compliant websites.
@@ -19,8 +21,11 @@ Here, xpath rules are stored in the `rules` file.
    Fancy name (description)(useless but not optional)
    http://example.com/path/to/the/rss/feed.xml
    http://example.co.uk/other/*/path/with/wildcard/*.xml
    //super/accurate[@xpath='expression']/..

As shown in the example, multiple urls can be specified for a single rule, so as to match feeds served from different locations of the same website (for example with or without "www."). Moreover, feed urls can be *NIX glob-style patterns, so as to match any feed from a website.

Works like a charm with Tiny Tiny RSS (<http://tt-rss.org/redmine/projects/tt-rss/wiki>).

### As a newsreader hook
@@ -51,4 +56,4 @@ Unwanted HTML elements are also stripped from the article.
---
GPL3 licence.
Python **2.6**+ required (not 3).
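The glob-style URL matching described in the README addition above boils down to running fnmatch over each url line of a rules block. A small self-contained sketch, not from the repo (the find_rule helper is made up; morss's own version is parseRules() in morss.py below), using two entries taken from the rules file in this commit:

    from fnmatch import fnmatch

    RULES = ("FranceInfo\n"
             "http://www.franceinfo.fr/rss*\n"
             "//h2[@class='chapo']/..\n"
             "\n"
             "Spiegel\n"
             "http://www.spiegel.de/schlagzeilen/*\n"
             "//div[@id='spArticleSection']")

    def find_rule(url, rules_text, default="//article|//h1/.."):
        # one block per feed: a name, one or more url patterns, then the xpath rule
        for block in rules_text.strip().split("\n\n"):
            lines = block.split("\n")
            patterns, xpath = lines[1:-1], lines[-1]
            if any(fnmatch(url, p) for p in patterns):
                return xpath
        return default

    print find_rule("http://www.spiegel.de/schlagzeilen/tops/index.rss", RULES)
    # -> //div[@id='spArticleSection']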

cache/.htaccess (vendored, 1 changed line)

@@ -1 +0,0 @@
DefaultType text/html

morss.py (483 changed lines)

@@ -1,15 +1,42 @@
#!/usr/bin/env python
import sys
import os
import copy
from base64 import b64encode, b64decode
from fnmatch import fnmatch
import os.path
import lxml.etree
import lxml.objectify
import lxml.html
import lxml.html.clean
import lxml.builder
import re
import string
import urllib2
from cookielib import CookieJar
import chardet

# DISCLAIMER: feedparser is pure shit if you intend to *edit* the feed.

SERVER = True
MAX = 70
TRASH = ['//h1', '//header']
E = lxml.objectify.E

ITEM_MAP = {
    'link':        (('{http://www.w3.org/2005/Atom}link', 'href'), '{}link'),
    'desc':        ('{http://www.w3.org/2005/Atom}summary', '{}description'),
    'description': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
    'summary':     ('{http://www.w3.org/2005/Atom}summary', '{}description'),
    'content':     ('{http://www.w3.org/2005/Atom}content', '{http://purl.org/rss/1.0/modules/content/}encoded')
    }
RSS_MAP = {
    'desc':        ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
    'description': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
    'subtitle':    ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
    'item':        ('{http://www.w3.org/2005/Atom}entry', '{}item'),
    'entry':       ('{http://www.w3.org/2005/Atom}entry', '{}item')
    }

if SERVER:
    import httplib

@@ -23,215 +50,317 @@ def log(txt):
    print txt
    if SERVER:
        with open('morss.log', 'a') as file:
            file.write(str(txt).encode('utf-8') + "\n")

def cleanXML(xml):
    table = string.maketrans('', '')
    return xml.translate(table, table[:32]).lstrip()

class Cache:
    """Light, error-prone caching system."""
    def __init__(self, folder, key):
        self._key = key
        self._dir = folder
        self._file = self._dir + "/" + str(hash(self._key))
        self._cached = {} # what *was* cached
        self._cache = {} # new things to put in cache

        if os.path.exists(self._file):
            data = open(self._file).read().strip().split("\n")
            for line in data:
                key, bdata = line.split("\t")
                self._cached[key] = bdata

        log(str(hash(self._key)))

    def get(self, key):
        if key in self._cached:
            return b64decode(self._cached[key])
        else:
            return None

    def save(self, key, content):
        # Maybe, appending to file when adding new elements could be
        # a good idea, but that'd require to check a couple of things,
        # like whether it has aleardy been over-written (ie. whether
        # it no longer contains self._cached)

        self._cache[key] = b64encode(content)

        txt = ""
        for (key, bdata) in self._cache.iteritems():
            txt += "\n" + str(key) + "\t" + bdata
        txt.strip()

        if not os.path.exists(self._dir):
            os.makedirs(self._dir)

        open(self._file, 'w').write(txt)

class XMLMap(object):
    """
    Sort of wrapper around lxml.objectify.StringElement (from which this
    class *DOESN'T* inherit) which makes "links" between different children
    of an element. For example, this allows cheap, efficient, transparent
    RSS 2.0/Atom seamless use, which can be way faster than feedparser, and
    has the advantage to edit the corresponding mapped fields. On top of
    that, XML output with "classic" lxml API calls (such as
    lxml.etree.tostring) is still possible. Element attributes are also
    supported (as in <entry attr='value'/>).

    However, keep in mind that this feature's support is only partial. For
    example if you want to alias an element to both <el>value</el> and <el
    href='value'/>, and put them as ('el', ('el', 'value')) in the _map
    definition, then only 'el' will be whatched, even if ('el', 'value')
    makes more sens in that specific case, because that would require to
    also check the others, in case of "better" match, which is not done now.

    Also, this class assumes there's some consistency in the _map
    definition. Which means that it expects matches to be always found in
    the same "column" in _map. This is useful when setting values which are
    not yet in the XML tree. Indeed the class will try to use the alias from
    the same column. With the RSS/Atom example, the default _map will always
    create elements for the same kind of feed.
    """
    def __init__(self, obj, alias=ITEM_MAP, string=False):
        self._xml = obj
        self._key = None
        self._map = alias
        self._str = string

        self._guessKey()
        self._E = E #lxml.objectify.ElementMaker(annotate=False)

    def _guessKey(self):
        for tag in self._map:
            self._key = 0
            for choice in self._map[tag]:
                if not isinstance(choice, tuple):
                    choice = (choice, None)
                el, attr = choice
                if hasattr(self._xml, el):
                    if attr is None:
                        return
                    else:
                        if attr in self._xml[el].attrib:
                            return
                self._key+=1
        self._key = 0

    def _getElement(self, tag):
        """Returns a tuple whatsoever."""
        if tag in self._map:
            for choice in self._map[tag]:
                if not isinstance(choice, tuple):
                    choice = (choice, None)
                el, attr = choice
                if hasattr(self._xml, el):
                    if attr is None:
                        return (self._xml[el], attr)
                    else:
                        if attr in self._xml[el].attrib:
                            return (self._xml[el], attr)
            return (None, None)
        if hasattr(self._xml, tag):
            return (self._xml[tag], None)
        return (None, None)

    def __getattr__(self, tag):
        el, attr = self._getElement(tag)

        if el is not None:
            if attr is None:
                out = el
            else:
                out = el.get(attr)
        else:
            out = self._xml.__getattr__(tag)

        return unicode(out) if self._str else out

    def __getitem__(self, tag):
        return self.__getattr__(tag)

    def __setattr__(self, tag, value):
        if tag.startswith('_'):
            return object.__setattr__(self, tag, value)

        el, attr = self._getElement(tag)

        if el is not None:
            if attr is None:
                if (isinstance(value, lxml.objectify.StringElement)
                        or isinstance(value, str)
                        or isinstance(value, unicode)):
                    el._setText(value)
                else:
                    el = value
                return
            else:
                el.set(attr, value)
                return

        choice = self._map[tag][self._key]
        if not isinstance(choice, tuple):
            child = lxml.objectify.Element(choice)
            self._xml.append(child)
            self._xml[choice] = value
            return
        else:
            el, attr = choice
            child = lxml.objectify.Element(choice, attrib={attr:value})
            self._xml.append(child)
            return

    def __contains__(self, tag):
        el, attr = self._getElement(tag)
        return el is not None

    def remove(self):
        self._xml.getparent().remove(self._xml)

    def tostring(self, **k):
        """Returns string using lxml. Arguments passed to tostring."""
        out = self._xml if self._xml.getparent() is None else self._xml.getparent()
        return lxml.etree.tostring(out, pretty_print=True, **k)

def EncDownload(url):
    try:
        cj = CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        con = opener.open(url)
        data = con.read()
    except (urllib2.HTTPError, urllib2.URLError) as error:
        log(error)
        log('http error')
        return False

    if con.headers.getparam('charset'):
        log('header')
        enc = con.headers.getparam('charset')
    else:
        match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data).groups()
        if len(match):
            log('meta.re')
            enc = match[0]
        else:
            log('chardet')
            enc = chardet.detect(data)['encoding']

    return (data, enc)

def parseRules(rulePath, url):
    rules = open(rulePath, "r").read().strip().split("\n\n")
    rules = [r.split('\n') for r in rules]
    for rule in rules:
        for domain in rule[1:-1]:
            if fnmatch(url, domain):
                return rule[-1]
    return '//article|//h1/..'

def Fill(rss, rule, cache):
    item = XMLMap(rss, ITEM_MAP, True)
    log(item.link)

    # content already provided?
    if 'content' in item:
        if len(item.content) > 4*len(item.desc):
            return item

    # check link
    if fnmatch(item.link, "http://*.feedsportal.com/*"):
        url = re.search('/([0-9a-zA-Z]+)/[^/]+$', item.link).groups()[0].split('0')
        t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'L':'ww', 'S':'w.', 'O':'.co.uk'}
        item.link = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
    if '{http://rssnamespace.org/feedburner/ext/1.0}origLink' in item:
        item.link = item['{http://rssnamespace.org/feedburner/ext/1.0}origLink']

    # check cache
    cached = cache.get(item.link)
    if cached is not None:
        log('cached')
        item.content = cached
        return item

    # download
    ddl = EncDownload(item.link)
    if ddl is False:
        return item
    data, enc = ddl
    log(enc)

    # parse
    parser = lxml.html.HTMLParser(encoding=enc)
    page = lxml.etree.fromstring(data, parser)

    # filter
    match = page.xpath(rule)
    if len(match):
        art = match[0]
        log('ok txt')
    else:
        log('no match')
        return item

    # clean
    for tag in TRASH:
        for elem in art.xpath(tag):
            elem.getparent().remove(elem)

    art.tag = 'div' # solves crash in lxml.html.clean
    art = lxml.html.clean.clean_html(art)
    out = lxml.etree.tostring(art, pretty_print=True).decode(enc, 'ignore')
    item.content = out
    cache.save(item.link, out)

def Gather(data, cachePath):
    # fetch feed
    if data.startswith("http"):
        req = urllib2.Request(data)
        req.add_unredirected_header('User-Agent', '')
        xml = urllib2.urlopen(req).read()
    else:
        xml = data

    xml = cleanXML(xml)
    rss = lxml.objectify.fromstring(xml)
    root = rss.channel if hasattr(rss, 'channel') else rss
    root = XMLMap(root, RSS_MAP)

    cache = Cache(cachePath, unicode(root.title))

    # rules
    if data.startswith("http"):
        rule = parseRules('rules', url)
    else:
        if len(sys.argv) > 1:
            rule = sys.argv[1]
        else:
            rule = '//article|//h1/..'

    # set
    log(rule)
    if MAX:
        for item in root.item[MAX:]:
            item.getparent().remove(item)
    for item in root.item:
        Fill(item, rule, cache)

    return root.tostring(xml_declaration=True, encoding='UTF-8')

if __name__ == "__main__":
    if SERVER:
        print 'Content-Type: text/html\n'
        url = os.environ['REQUEST_URI'][len(os.environ['SCRIPT_NAME'])+1:]
        url = 'http://' + url.replace(' ', '%20')
        cache = os.getcwd() + '/cache'
        log(url)
        RSS = Gather(url, cache)
    else:
        xml = sys.stdin.read()
        cache = os.path.expanduser('~') + '/.cache/morss'
        RSS = Gather(xml, cache)

    if SERVER or not os.getenv('DEBUG', False):
        print RSS

    log('done')
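A usage sketch, not part of the commit, of the XMLMap wrapper defined above: it reads an Atom entry through the RSS-style aliases from ITEM_MAP and writes grabbed content back, which is what Fill() does with real feed items. It assumes morss.py is importable as a module and that lxml and chardet are installed; the sample entry is made up.

    import lxml.objectify
    from morss import XMLMap, ITEM_MAP   # assumes morss.py is importable as a module

    ENTRY = """<entry xmlns="http://www.w3.org/2005/Atom">
    <title>hello</title>
    <link href="http://example.com/a"/>
    <summary>short text</summary>
    </entry>"""

    item = XMLMap(lxml.objectify.fromstring(ENTRY), ITEM_MAP, True)

    print item.link    # http://example.com/a  (read from the Atom link's href attribute)
    print item.desc    # short text            (aliased to the Atom <summary>)

    # writing through an alias: no content element exists yet, so XMLMap creates
    # one (an Atom <content> element, per column 0 of ITEM_MAP) and sets its text
    item.content = "<p>full text</p>"
    print item.tostring()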

rules (30 changed lines)

@@ -1,15 +1,37 @@
TehranTimes
http://www.tehrantimes.com/*
http://tehrantimes.com/*
//div[@class='article-indent']

FranceInfo
http://www.franceinfo.fr/rss*
//h2[@class='chapo']/..

Les Echos
http://rss.feedsportal.com/c/499/f/413829/index.rss
http://syndication.lesechos.fr/rss/*
//h1/../..

Spiegel
http://www.spiegel.de/schlagzeilen/*
//div[@id='spArticleSection']

Le Soir
http://www.lesoir.be/feed/*
//div[@class='article-content']

Stack Overflow
http://stackoverflow.com/feeds/*
//*[@id='question']

Daily Telegraph
http://www.telegraph.co.uk/*
//*[@id='mainBodyArea']

Cracked.com
http://feeds.feedburner.com/CrackedRSS
//div[@class='content']|//section[@class='body']

TheOnion
http://feeds.theonion.com/*
//article
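Finally, a sketch (not from the repo) of driving the new Gather() entry point directly from Python rather than through the CGI or stdin paths shown in morss.py above. It assumes morss.py imports cleanly as a module and hands Gather() already-downloaded XML text; in that branch Gather() does not consult the rules file and falls back to sys.argv[1] or the default //article|//h1/.. xpath, since the URL branch relies on the url variable set in the CGI path.

    import urllib2
    from morss import Gather   # assumes morss.py is on the Python path

    # download the feed ourselves and hand Gather() the XML text;
    # the full-text version comes back as an XML string
    xml = urllib2.urlopen('http://www.spiegel.de/schlagzeilen/tops/index.rss').read()
    print Gather(xml, './cache')   # './cache' is created on demand by Cache.save()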