From af8879049f2149e8fcf68a7a4d4487c9f6d66d2e Mon Sep 17 00:00:00 2001
From: pictuga
Date: Mon, 15 Apr 2013 18:51:55 +0200
Subject: [PATCH] Another huge commit. Now uses OOP where it fits.

Atom feeds are supported, but no real tests were made. Unix globbing is now
possible for URLs. Caching is done in a cleaner way. Feedburner links are also
replaced. HTML is cleaned in a more efficient way.

Code is now much cleaner, using lxml.objectify and a small wrapper to access
Atom feeds as if they were RSS feeds (and much faster than feedparser).

README has been updated.
---
 README.md       |   9 +-
 cache/.htaccess |   1 -
 morss.py        | 483 ++++++++++++++++++++++++++++++------------------
 rules           |  30 ++-
 4 files changed, 339 insertions(+), 184 deletions(-)
 delete mode 100644 cache/.htaccess

diff --git a/README.md b/README.md
index 2816d43..a949cb9 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,8 @@
 This tool's goal is to get full-text RSS feeds out of stripped RSS feeds, commonly available on the internet. Indeed most newspapers only make a small description available to users in their RSS feeds, which makes the RSS feed rather useless. So this tool intends to fix that problem. This tool opens the links from the RSS feed, then downloads the full article from the newspaper website and puts it back in the RSS feed.
 
+morss also has experimental support for Atom feeds.
+
 ##(xpath) Rules
 
 To find the article content on the newspaper's website, morss needs to know where to look. The default target is the parent of the first `<h1>` element, since it's a common practice, or an `<article>` element, for HTML5 compliant websites.
@@ -19,9 +21,12 @@ Here, xpath rules stored in the `rules` file. (The name of the file can be chang
 	Fancy name (description)(useless but not optional)
 	http://example.com/path/to/the/rss/feed.xml
+	http://example.co.uk/other/*/path/with/wildcard/*.xml
 	//super/accurate[@xpath='expression']/..
 
-Works like a charm with Tiny TinyRSS ().
+As shown in the example, multiple URLs can be specified for a single rule, so that feeds served from different locations on the same website (for example with or without "www.") can be matched. Moreover, feed URLs can be *NIX glob-style patterns, so that any feed from a website can be matched.
+
+Works like a charm with Tiny Tiny RSS ().
 
 ###As a newsreader hook
 
@@ -51,4 +56,4 @@ Unwanted HTML elements are also stripped from the article. By default, elements
 
 ---
 GPL3 licence.
-Python **2.6** required (not 3).
+Python **2.6**+ required (not 3).
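The glob-style URL matching that the README hunk above introduces is done with Python's fnmatch module, which parseRules() relies on further down in this patch. A minimal, self-contained sketch of the idea, assuming the three-part block format shown in the README; the rule text, feed URL and helper name are made up for illustration:

    from fnmatch import fnmatch

    # Hypothetical rules text, in the "name / url pattern(s) / xpath" block
    # format described in the README; the content itself is invented.
    RULES = """Example site
    http://example.com/feeds/*
    http://www.example.com/feeds/*
    //div[@class='post-body']"""

    def find_rule(rules_text, feed_url, default='//article|//h1/..'):
        # Blocks are separated by blank lines; the first line is the name,
        # the last line the xpath, everything in between is a URL pattern.
        for block in rules_text.strip().split('\n\n'):
            lines = block.split('\n')
            for pattern in lines[1:-1]:
                if fnmatch(feed_url, pattern.strip()):
                    return lines[-1]
        return default

    print(find_rule(RULES, 'http://example.com/feeds/all.rss.xml'))
    # -> //div[@class='post-body']
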
diff --git a/cache/.htaccess b/cache/.htaccess
deleted file mode 100644
index d4e561c..0000000
--- a/cache/.htaccess
+++ /dev/null
@@ -1 +0,0 @@
-DefaultType text/html

diff --git a/morss.py b/morss.py
index c7a95b0..f032699 100644
--- a/morss.py
+++ b/morss.py
@@ -1,15 +1,42 @@
 #!/usr/bin/env python
 import sys
 import os
-from os.path import expanduser
-from lxml import etree
+import copy
+from base64 import b64encode, b64decode
+from fnmatch import fnmatch
+import os.path
+import lxml.etree
+import lxml.objectify
+import lxml.html
+import lxml.html.clean
+import lxml.builder
 import re
 import string
 import urllib2
 from cookielib import CookieJar
 import chardet
 
+# DISCLAIMER: feedparser is pure shit if you intend to *edit* the feed.
+
 SERVER = True
+MAX = 70
+TRASH = ['//h1', '//header']
+E = lxml.objectify.E
+
+ITEM_MAP = {
+    'link':        (('{http://www.w3.org/2005/Atom}link', 'href'), '{}link'),
+    'desc':        ('{http://www.w3.org/2005/Atom}summary', '{}description'),
+    'description': ('{http://www.w3.org/2005/Atom}summary', '{}description'),
+    'summary':     ('{http://www.w3.org/2005/Atom}summary', '{}description'),
+    'content':     ('{http://www.w3.org/2005/Atom}content', '{http://purl.org/rss/1.0/modules/content/}encoded')
+    }
+RSS_MAP = {
+    'desc':        ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
+    'description': ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
+    'subtitle':    ('{http://www.w3.org/2005/Atom}subtitle', '{}description'),
+    'item':        ('{http://www.w3.org/2005/Atom}entry', '{}item'),
+    'entry':       ('{http://www.w3.org/2005/Atom}entry', '{}item')
+    }
 
 if SERVER:
     import httplib
@@ -23,215 +50,317 @@ def log(txt):
     print txt
     if SERVER:
         with open('morss.log', 'a') as file:
-            if isinstance(txt, str):
-                file.write(txt.encode('utf-8') + "\n")
+            file.write(str(txt).encode('utf-8') + "\n")
 
-class Info:
-    def __init__(self, item, feed):
-        self.item = item
-        self.feed = feed
+def cleanXML(xml):
+    table = string.maketrans('', '')
+    return xml.translate(table, table[:32]).lstrip()
 
-        self.data = False
-        self.page = False
-        self.html = False
-        self.con = False
-        self.opener = False
-        self.enc = False
+class Cache:
+    """Light, error-prone caching system."""
+    def __init__(self, folder, key):
+        self._key = key
+        self._dir = folder
+        self._file = self._dir + "/" + str(hash(self._key))
+        self._cached = {} # what *was* cached
+        self._cache = {} # new things to put in cache
 
-        self.link = self.item.xpath('link')[0]
-        self.desc = self.item.xpath('description')[0]
+        if os.path.exists(self._file):
+            data = open(self._file).read().strip().split("\n")
+            for line in data:
+                key, bdata = line.split("\t")
+                self._cached[key] = bdata
 
-    def checkURL(self):
-        if self.link.text.startswith("http://rss.feedsportal.com"):
-            log('feedsportal')
-            url = re.search('/([0-9a-zA-Z]+)/[a-zA-Z0-9\.]+$', self.link.text).groups()[0].split('0')
-            t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'L':'ww', 'S':'w.'}
-            self.link.text = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
-            log(self.link.text)
+        log(str(hash(self._key)))
 
-    def fetch(self):
-        log(self.link.text)
-        self.checkURL()
-        if not self.findCache():
-            self.download()
-            self.chardet()
-            self.fetchDesc()
-            self.save()
-        log(self.enc)
-
-    def parseHTML(self):
-        if self.enc is False:
-            self.page = etree.HTML(self.data)
+    def get(self, key):
+        if key in self._cached:
+            return b64decode(self._cached[key])
         else:
-            try:
-                self.page = etree.HTML(self.data.decode(self.enc, 'ignore'))
-            except ValueError:
-                self.page = etree.HTML(self.data)
+            return None
 
-    def save(self):
-        self.feed.save()
+    def save(self, key, content):
+        # Maybe appending to the file when adding new elements could be
+        # a good idea, but that'd require checking a couple of things,
+        # like whether it has already been over-written (ie. whether
+        # it no longer contains self._cached)
 
-    def findCache(self):
-        if self.feed.cache is not False:
-            xpath = "//link[text()='" + self.link.text + "']/../description/text()"
-            match = self.feed.cache.xpath(xpath)
-            if len(match):
-                log('cached')
-                self.desc.text = match[0]
-                return True
+        self._cache[key] = b64encode(content)
+
+        txt = ""
+        for (key, bdata) in self._cache.iteritems():
+            txt += "\n" + str(key) + "\t" + bdata
+        txt = txt.strip()
+
+        if not os.path.exists(self._dir):
+            os.makedirs(self._dir)
+
+        open(self._file, 'w').write(txt)
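The Cache class above keeps one tab-separated, base64-encoded entry per line in a file named after the hash of the feed title. A rough usage sketch of the get/save round trip as Fill() uses it later in the patch; the folder name, URL and content are placeholders:

    # Hypothetical usage; the folder, URL and content are examples only.
    cache = Cache('cache', u'Some feed title')

    url = 'http://example.com/article/1'
    if cache.get(url) is None:
        # ... download and extract the article here ...
        cache.save(url, '<div>full text</div>')   # written to disk right away

    # A later run with the same feed title reads the entry back from disk.
    cache = Cache('cache', u'Some feed title')
    print(cache.get(url))  # -> '<div>full text</div>'
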
+
+class XMLMap(object):
+    """
+    Sort of wrapper around lxml.objectify.StringElement (from which this
+    class *DOESN'T* inherit) which makes "links" between different children
+    of an element. For example, this allows cheap, efficient, transparent
+    RSS 2.0/Atom seamless use, which can be way faster than feedparser, and
+    has the advantage that the corresponding mapped fields can be edited. On
+    top of that, XML output with "classic" lxml API calls (such as
+    lxml.etree.tostring) is still possible. Element attributes are also
+    supported (as in the `href` attribute of an Atom <link> element).
+
+    However, keep in mind that this feature's support is only partial. For
+    example if you want to alias a tag to both <el>text</el> and
+    <el value='text'/>, and put them as ('el', ('el', 'value')) in the _map
+    definition, then only 'el' will be watched, even if ('el', 'value')
+    makes more sense in that specific case, because that would require
+    also checking the others, in case of a "better" match, which is not
+    done now.
+
+    Also, this class assumes there's some consistency in the _map
+    definition. Which means that it expects matches to be always found in
+    the same "column" in _map. This is useful when setting values which are
+    not yet in the XML tree. Indeed the class will try to use the alias from
+    the same column. With the RSS/Atom example, the default _map will always
+    create <description> elements for the same kind of feed.
+    """
+    def __init__(self, obj, alias=ITEM_MAP, string=False):
+        self._xml = obj
+        self._key = None
+        self._map = alias
+        self._str = string
+
+        self._guessKey()
+        self._E = E #lxml.objectify.ElementMaker(annotate=False)
+
+    def _guessKey(self):
+        for tag in self._map:
+            self._key = 0
+            for choice in self._map[tag]:
+                if not isinstance(choice, tuple):
+                    choice = (choice, None)
+                el, attr = choice
+                if hasattr(self._xml, el):
+                    if attr is None:
+                        return
+                    else:
+                        if attr in self._xml[el].attrib:
+                            return
+                self._key += 1
+        self._key = 0
+
+    def _getElement(self, tag):
+        """Returns an (element, attribute) tuple; (None, None) if no match."""
+        if tag in self._map:
+            for choice in self._map[tag]:
+                if not isinstance(choice, tuple):
+                    choice = (choice, None)
+                el, attr = choice
+                if hasattr(self._xml, el):
+                    if attr is None:
+                        return (self._xml[el], attr)
+                    else:
+                        if attr in self._xml[el].attrib:
+                            return (self._xml[el], attr)
+            return (None, None)
+        if hasattr(self._xml, tag):
+            return (self._xml[tag], None)
+        return (None, None)
+
+    def __getattr__(self, tag):
+        el, attr = self._getElement(tag)
+
+        if el is not None:
+            if attr is None:
+                out = el
+            else:
+                out = el.get(attr)
+        else:
+            out = self._xml.__getattr__(tag)
+
+        return unicode(out) if self._str else out
+
+    def __getitem__(self, tag):
+        return self.__getattr__(tag)
+
+    def __setattr__(self, tag, value):
+        if tag.startswith('_'):
+            return object.__setattr__(self, tag, value)
+
+        el, attr = self._getElement(tag)
+
+        if el is not None:
+            if attr is None:
+                if (isinstance(value, lxml.objectify.StringElement)
+                    or isinstance(value, str)
+                    or isinstance(value, unicode)):
+                    el._setText(value)
+                else:
+                    el = value
+                return
+            else:
+                el.set(attr, value)
+                return
+
+        choice = self._map[tag][self._key]
+        if not isinstance(choice, tuple):
+            child = lxml.objectify.Element(choice)
+            self._xml.append(child)
+            self._xml[choice] = value
+            return
+        else:
+            el, attr = choice
+            child = lxml.objectify.Element(el, attrib={attr: value})
+            self._xml.append(child)
+            return
+
+    def __contains__(self, tag):
+        el, attr = self._getElement(tag)
+        return el is not None
+
+    def remove(self):
+        self._xml.getparent().remove(self._xml)
+
+    def tostring(self, **k):
+        """Returns a string using lxml; arguments are passed to lxml.etree.tostring."""
+        out = self._xml if self._xml.getparent() is None else self._xml.getparent()
+        return lxml.etree.tostring(out, pretty_print=True, **k)
+
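XMLMap is the wrapper the commit message refers to: the same attribute names work whether the underlying element is an RSS 2.0 item or an Atom entry, because each ITEM_MAP entry lists one alias per feed flavour. A short sketch of the intended behaviour, assuming the XMLMap and ITEM_MAP definitions above; the Atom snippet is invented:

    import lxml.objectify

    # A made-up Atom entry; the namespace is the one used in ITEM_MAP.
    atom_entry = lxml.objectify.fromstring(
        '<entry xmlns="http://www.w3.org/2005/Atom">'
        '<link href="http://example.com/post"/>'
        '<summary>short text</summary>'
        '</entry>')

    item = XMLMap(atom_entry, ITEM_MAP, string=True)
    print(item.link)   # -> http://example.com/post  (read from the href attribute)
    print(item.desc)   # -> short text               (aliased to atom:summary)

    # The same names would work on an RSS 2.0 <item>, where 'link' is an
    # element's text and 'desc' maps to <description>.
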
+def EncDownload(url):
+    try:
+        cj = CookieJar()
+        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
+        con = opener.open(url)
+        data = con.read()
+    except (urllib2.HTTPError, urllib2.URLError) as error:
+        log(error)
+        log('http error')
         return False
 
-    def fetchDesc(self):
-        self.parseHTML()
-        match = self.page.xpath(self.feed.rule)
+    if con.headers.getparam('charset'):
+        log('header')
+        enc = con.headers.getparam('charset')
+    else:
+        match = re.search('charset=["\']?([0-9a-zA-Z-]+)', data)
+        match = match.groups() if match else []
         if len(match):
-            self.html = match[0]
-            self.deleteTags()
-            self.desc.text = etree.tostring(self.html).decode(self.enc, 'ignore')
-            log('ok txt')
+            log('meta.re')
+            enc = match[0]
         else:
-            log('no match')
+            log('chardet')
+            enc = chardet.detect(data)['encoding']
 
-    def download(self):
-        try:
-            cj = CookieJar()
-            self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
-            self.con = self.opener.open(self.link.text.encode('utf-8'))
-            self.data = self.con.read()
-        except (urllib2.HTTPError, urllib2.URLError) as error:
-            log(error)
-            log('http error')
+    return (data, enc)
 
-    def chardet(self):
-        if self.con.headers.getparam('charset'):
-            log('header')
-            self.enc = self.con.headers.getparam('charset')
-            return
+def parseRules(rulePath, url):
+    rules = open(rulePath, "r").read().strip().split("\n\n")
+    rules = [r.split('\n') for r in rules]
+    for rule in rules:
+        for domain in rule[1:-1]:
+            if fnmatch(url, domain):
+                return rule[-1]
+    return '//article|//h1/..'
 
-        page = etree.HTML(self.data)
-        header = page.xpath("//head/meta[@http-equiv='Content-Type']/@content")
-        if len(header) and len(header[0].split("=")):
-            log('meta')
-            self.enc = header[0].split("=")[1]
-            return
+def Fill(rss, rule, cache):
+    item = XMLMap(rss, ITEM_MAP, True)
+    log(item.link)
 
-        header = page.xpath("//head/meta[@charset]/@charset")
-        if len(header):
-            log('meta2')
-            self.enc = header[0]
-            return
+    # content already provided?
+    if 'content' in item:
+        if len(item.content) > 4*len(item.desc):
+            return item
 
-        log('chardet')
-        self.enc = chardet.detect(self.data)['encoding']
+    # check link
+    if fnmatch(item.link, "http://*.feedsportal.com/*"):
+        url = re.search('/([0-9a-zA-Z]+)/[^/]+$', item.link).groups()[0].split('0')
+        t = {'A':'0', 'B':'.', 'C':'/', 'D':'?', 'E':'-', 'L':'ww', 'S':'w.', 'O':'.co.uk'}
+        item.link = 'http://' + "".join([(t[s[0]] if s[0] in t else "=") + s[1:] for s in url[1:]])
+    if '{http://rssnamespace.org/feedburner/ext/1.0}origLink' in item:
+        item.link = item['{http://rssnamespace.org/feedburner/ext/1.0}origLink']
 
-    def deleteTags(self):
-        for tag in self.feed.trash:
-            for elem in self.html.xpath(tag):
-                elem.getparent().remove(elem)
+    # check cache
+    cached = cache.get(item.link)
+    if cached is not None:
+        log('cached')
+        item.content = cached
+        return item
 
-class Feed:
-    def __init__(self, impl, data, cachePath):
-        self.rulePath = 'rules'
-        self.rule = '//article|//h1/..'
+    # download
+    ddl = EncDownload(item.link)
 
-        self.trash = ['//script', '//iframe', '//object', '//noscript', '//form', '//h1']
-        self.max = 70
+    if ddl is False:
+        return item
 
-        self.cachePath = cachePath
-        self.cacheFile = False
-        self.cache = False
-        self.impl = impl
+    data, enc = ddl
+    log(enc)
 
-        self.items = []
-        self.rss = False
-        self.out = False
+    # parse
+    parser = lxml.html.HTMLParser(encoding=enc)
+    page = lxml.etree.fromstring(data, parser)
 
-        if self.impl == 'server':
-            self.url = data
-            self.xml = False
+    # filter
+    match = page.xpath(rule)
+    if len(match):
+        art = match[0]
+        log('ok txt')
+    else:
+        log('no match')
+        return item
+
+    # clean
+    for tag in TRASH:
+        for elem in art.xpath(tag):
+            elem.getparent().remove(elem)
+
+    art.tag = 'div' # solves crash in lxml.html.clean
+    art = lxml.html.clean.clean_html(art)
+    out = lxml.etree.tostring(art, pretty_print=True).decode(enc, 'ignore')
+    item.content = out
+    cache.save(item.link, out)
+
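The cleaning step above combines two mechanisms: elements matched by the TRASH xpath expressions are detached from the tree, and lxml.html.clean.clean_html() strips scripts and similar unwanted markup with its default Cleaner settings. A small stand-alone sketch of the same idea on made-up HTML:

    import lxml.html
    import lxml.html.clean

    html = lxml.html.fromstring(
        '<div><h1>title</h1><script>evil()</script><p>body text</p></div>')

    # Drop elements matched by xpath expressions, as the TRASH loop does.
    for expr in ['//h1', '//header']:
        for elem in html.xpath(expr):
            elem.getparent().remove(elem)

    # clean_html() returns a cleaned copy; by default it removes <script>,
    # javascript attributes, embedded objects and the like.
    cleaned = lxml.html.clean.clean_html(html)
    print(lxml.html.tostring(cleaned))  # roughly: <div><p>body text</p></div>
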
+def Gather(data, cachePath):
+    # fetch feed
+    if data.startswith("http"):
+        req = urllib2.Request(data)
+        req.add_unredirected_header('User-Agent', '')
+        xml = urllib2.urlopen(req).read()
+    else:
+        xml = data
+
+    xml = cleanXML(xml)
+    rss = lxml.objectify.fromstring(xml)
+    root = rss.channel if hasattr(rss, 'channel') else rss
+    root = XMLMap(root, RSS_MAP)
+
+    cache = Cache(cachePath, unicode(root.title))
+
+    # rules
+    if data.startswith("http"):
+        rule = parseRules('rules', data)
+    else:
+        if len(sys.argv) > 1:
+            rule = sys.argv[1]
         else:
-            self.url = False
-            self.xml = data
+            rule = '//article|//h1/..'
 
-    def save(self):
-        self.out = etree.tostring(self.rss, xml_declaration=True, pretty_print=True)
-        open(self.cacheFile, 'w').write(self.out)
+    # set
+    log(rule)
+    if MAX:
+        for item in root.item[MAX:]:
+            item.getparent().remove(item)
+    for item in root.item:
+        Fill(item, rule, cache)
 
-    def getData(self):
-        if self.impl == 'server':
-            req = urllib2.Request(self.url)
-            req.add_unredirected_header('User-Agent', '')
-            self.xml = urllib2.urlopen(req).read()
-            self.cleanXml()
-
-    def setCache(self):
-        if self.cache is not False:
-            return
-
-        self.parse()
-        key = str(hash(self.rss.xpath('//channel/title/text()')[0]))
-        self.cacheFile = self.cachePath + "/" + key
-        log(self.cacheFile)
-        if not os.path.exists(self.cachePath):
-            os.makedirs(self.cachePath)
-
-        if os.path.exists(self.cacheFile):
-            self.cache = etree.XML(open(self.cacheFile, 'r').read())
-
-    def parse(self):
-        if self.rss is not False:
-            return
-
-        self.rss = etree.XML(self.xml)
-
-    def setItems(self):
-        self.items = [Info(e, self) for e in self.rss.xpath('//item')]
-        if self.max:
-            self.items = self.items[:self.max]
-
-    def fill(self):
-        self.parseRules()
-        log(self.rule)
-        for item in self.items:
-            item.fetch()
-
-    def cleanXml(self):
-        table = string.maketrans('', '')
-        self.xml = self.xml.translate(table, table[:32]).lstrip()
-
-    def parseRules(self):
-        if self.impl == 'server':
-            rules = open(self.rulePath, "r").read().split("\n\n")
-            rules = [r.split('\n') for r in rules]
-            for rule in rules:
-                if rule[1] == self.url:
-                    self.rule = rule[2]
-                    return
-        else:
-            if len(sys.argv) > 1:
-                self.rule = sys.argv[1]
+    return root.tostring(xml_declaration=True, encoding='UTF-8')
 
 if __name__ == "__main__":
     if SERVER:
         print 'Content-Type: text/html\n'
         url = os.environ['REQUEST_URI'][len(os.environ['SCRIPT_NAME'])+1:]
         url = 'http://' + url.replace(' ', '%20')
+        cache = os.getcwd() + '/cache'
         log(url)
-        RSS = Feed('server', url, os.getcwd() + '/cache')
+        RSS = Gather(url, cache)
     else:
-        xml = sys.stdin.read()
-        cache = expanduser('~') + '/.cache/morss'
-        RSS = Feed('liferea', xml, os.getcwd() + '/cache')
-
-    RSS.getData()
-    RSS.parse()
-    RSS.setCache()
-    RSS.setItems()
-    RSS.fill()
-    RSS.save()
+        xml = sys.stdin.read()
+        cache = os.path.expanduser('~') + '/.cache/morss'
+        RSS = Gather(xml, cache)
 
     if SERVER or not os.getenv('DEBUG', False):
-        print RSS.out
-    else:
-        print 'done'
+        print RSS
+
+    log('done')
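After the refactor, Gather() is the single entry point: it takes either a feed URL (the CGI case) or raw feed XML (the newsreader-hook case) plus a cache folder, and returns the rewritten feed as a string. A hedged sketch of both call styles; the URL, cache path and pipe command are examples, not part of the patch:

    # As a CGI script, morss.py builds the feed URL from REQUEST_URI itself;
    # used as a library, the same function can be driven directly.
    out = Gather('http://example.com/feed.xml', './cache')
    print(out)

    # As a newsreader hook, the feed XML arrives on stdin instead, e.g.
    #   cat feed.xml | python morss.py "//div[@class='post']"
    # which ends up as Gather(sys.stdin.read(), ...) with the optional
    # argv[1] xpath rule overriding the default one.
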
diff --git a/rules b/rules
index b1e6890..7bb9cdd 100644
--- a/rules
+++ b/rules
@@ -1,15 +1,37 @@
 TehranTimes
-http://www.tehrantimes.com/component/ninjarsssyndicator/?feed_id=1&format=raw
+http://www.tehrantimes.com/*
+http://tehrantimes.com/*
 //div[@class='article-indent']
 
 FranceInfo
-http://www.franceinfo.fr/rss.xml
+http://www.franceinfo.fr/rss*
 //h2[@class='chapo']/..
 
+Les Echos
+http://rss.feedsportal.com/c/499/f/413829/index.rss
+http://syndication.lesechos.fr/rss/*
+//h1/../..
+
 Spiegel
-http://www.spiegel.de/schlagzeilen/tops/index.rss
+http://www.spiegel.de/schlagzeilen/*
 //div[@id='spArticleSection']
 
 Le Soir
-http://www.lesoir.be/feed/La%20Une/destination_une_block/
+http://www.lesoir.be/feed/*
 //div[@class='article-content']
+
+Stack Overflow
+http://stackoverflow.com/feeds/*
+//*[@id='question']
+
+Daily Telegraph
+http://www.telegraph.co.uk/*
+//*[@id='mainBodyArea']
+
+Cracked.com
+http://feeds.feedburner.com/CrackedRSS
+//div[@class='content']|//section[@class='body']
+
+TheOnion
+http://feeds.theonion.com/*
+//article
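Each block above pairs one or more glob URL patterns with the xpath expression that extracts the article body on that site. A quick way to check a candidate expression before adding a new rule could look like this; the helper, target URL and rule are illustrative only and not part of morss:

    import urllib2
    import lxml.html

    def try_rule(page_url, xpath_rule):
        # Download the article page and report what the rule would capture.
        html = lxml.html.fromstring(urllib2.urlopen(page_url).read())
        matches = html.xpath(xpath_rule)
        if not matches:
            print('no match')
            return
        print(lxml.html.tostring(matches[0])[:200])

    # Example (placeholder URL and rule):
    try_rule('http://example.com/some-article', "//div[@class='article-content']")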