From 7211093cc5195a7aa632f22417d2dd0d56cf89ab Mon Sep 17 00:00:00 2001 From: pictuga Date: Mon, 16 Jun 2014 14:00:02 +0200 Subject: [PATCH] Add :smart :noref modes, update README --- README.md | 2 ++ morss/morss.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/README.md b/README.md index f41a3f6..3b70911 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ The arguments are: - Advanced - `csv`: export to csv - `md`: convert articles to Markdown + - `nolink`: drop links, but keep links' inner text + - `noref`: drop items' link - `cache`: only take articles from the cache (ie. don't grab new articles' content), so as to save time - `debug`: to have some feedback from the script execution. Useful for debugging - `theforce`: force download the rss feed diff --git a/morss/morss.py b/morss/morss.py index 4da85ea..9b7a912 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -11,6 +11,7 @@ from fnmatch import fnmatch import re import json +import lxml.etree import lxml.html import feeds @@ -628,6 +629,16 @@ def After(rss, options): if not options.keep: del item.desc + if options.nolink and item.content: + content = lxml.html.fromstring(item.content) + for link in content.xpath('//a'): + log(link.text_content()) + link.drop_tag() + item.content = lxml.etree.tostring(content) + + if options.noref: + item.link = '' + if options.md: conv = HTML2Text(baseurl=item.link) conv.unicode_snob = True