Also make buriy's readability port compatible

Should be faster, and it now supports py3
2015-08-29 18:33:12 +02:00
parent 95d9d847e9
commit 466d8e47d6
2 changed files with 14 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ You do need:

 - [python](http://www.python.org/) >= 2.6 (python 3 is supported)
 - [lxml](http://lxml.de/) for xml parsing
- [this](https://github.com/bookieio/breadability) readability fork
+- [this](https://github.com/bookieio/breadability) or [this](https://github.com/buriy/python-readability) readability fork
 - [dateutil](http://labix.org/python-dateutil) to parse feed dates
 - [html2text](http://www.aaronsw.com/2002/html2text/)
 - [OrderedDict](https://pypi.python.org/pypi/ordereddict) if using python < 2.7
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -19,7 +19,6 @@ from . import crawler
 import wsgiref.simple_server
 import wsgiref.handlers

-import breadability.readable
 from html2text import HTML2Text

 try:
@@ -80,6 +79,18 @@ def log(txt, force=False):
            print(repr(txt))


+try:
+    from readability.readability import Document
+
+    def readability(html, url=None):
+        return Document(html, url=url).summary()
+except ImportError:
+    import breadability.readable
+
+    def readability(html, url=None):
+        return breadability.readable.Article(html, url=url).readable
+
+
 def len_html(txt):
    if len(txt):
        return len(lxml.html.fromstring(txt).text_content())
@@ -282,7 +293,7 @@ def Fill(item, options, feedurl='/', fast=False):
        log('non-text page')
        return True

-    out = breadability.readable.Article(data, url=con.url).readable
+    out = readability(data, con.url)

    if options.hungry or count_words(out) > max(count_content, count_desc):
        item.push_content(out)