From 466d8e47d6adb979d85ff8d2d9615d3b100a5966 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 29 Aug 2015 18:33:12 +0200 Subject: [PATCH] Also make buriy's readability port compatible Should be faster, and it now supports py3 --- README.md | 2 +- morss/morss.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c1f9527..9381c47 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ You do need: - [python](http://www.python.org/) >= 2.6 (python 3 is supported) - [lxml](http://lxml.de/) for xml parsing -- [this](https://github.com/bookieio/breadability) readability fork +- [this](https://github.com/bookieio/breadability) or [this](https://github.com/buriy/python-readability) readability fork - [dateutil](http://labix.org/python-dateutil) to parse feed dates - [html2text](http://www.aaronsw.com/2002/html2text/) - [OrderedDict](https://pypi.python.org/pypi/ordereddict) if using python < 2.7 diff --git a/morss/morss.py b/morss/morss.py index f692dad..9d152dc 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -19,7 +19,6 @@ from . import crawler import wsgiref.simple_server import wsgiref.handlers -import breadability.readable from html2text import HTML2Text try: @@ -80,6 +79,18 @@ def log(txt, force=False): print(repr(txt)) +try: + from readability.readability import Document + + def readability(html, url=None): + return Document(html, url=url).summary() +except ImportError: + import breadability.readable + + def readability(html, url=None): + return breadability.readable.Article(html, url=url).readable + + def len_html(txt): if len(txt): return len(lxml.html.fromstring(txt).text_content()) @@ -282,7 +293,7 @@ def Fill(item, options, feedurl='/', fast=False): log('non-text page') return True - out = breadability.readable.Article(data, url=con.url).readable + out = readability(data, con.url) if options.hungry or count_words(out) > max(count_content, count_desc): item.push_content(out)