Also make buriy's readability port compatible
Should be faster, and it now supports py3
master
parent
95d9d847e9
commit
466d8e47d6
|
@ -30,7 +30,7 @@ You do need:
|
||||||
|
|
||||||
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
|
- [python](http://www.python.org/) >= 2.6 (python 3 is supported)
|
||||||
- [lxml](http://lxml.de/) for xml parsing
|
- [lxml](http://lxml.de/) for xml parsing
|
||||||
- [this](https://github.com/bookieio/breadability) readability fork
|
- [this](https://github.com/bookieio/breadability) or [this](https://github.com/buriy/python-readability) readability fork
|
||||||
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
|
- [dateutil](http://labix.org/python-dateutil) to parse feed dates
|
||||||
- [html2text](http://www.aaronsw.com/2002/html2text/)
|
- [html2text](http://www.aaronsw.com/2002/html2text/)
|
||||||
- [OrderedDict](https://pypi.python.org/pypi/ordereddict) if using python < 2.7
|
- [OrderedDict](https://pypi.python.org/pypi/ordereddict) if using python < 2.7
|
||||||
|
|
|
@ -19,7 +19,6 @@ from . import crawler
|
||||||
import wsgiref.simple_server
|
import wsgiref.simple_server
|
||||||
import wsgiref.handlers
|
import wsgiref.handlers
|
||||||
|
|
||||||
import breadability.readable
|
|
||||||
from html2text import HTML2Text
|
from html2text import HTML2Text
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -80,6 +79,18 @@ def log(txt, force=False):
|
||||||
print(repr(txt))
|
print(repr(txt))
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
from readability.readability import Document
|
||||||
|
|
||||||
|
def readability(html, url=None):
|
||||||
|
return Document(html, url=url).summary()
|
||||||
|
except ImportError:
|
||||||
|
import breadability.readable
|
||||||
|
|
||||||
|
def readability(html, url=None):
|
||||||
|
return breadability.readable.Article(html, url=url).readable
|
||||||
|
|
||||||
|
|
||||||
def len_html(txt):
|
def len_html(txt):
|
||||||
if len(txt):
|
if len(txt):
|
||||||
return len(lxml.html.fromstring(txt).text_content())
|
return len(lxml.html.fromstring(txt).text_content())
|
||||||
|
@ -282,7 +293,7 @@ def Fill(item, options, feedurl='/', fast=False):
|
||||||
log('non-text page')
|
log('non-text page')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
out = breadability.readable.Article(data, url=con.url).readable
|
out = readability(data, con.url)
|
||||||
|
|
||||||
if options.hungry or count_words(out) > max(count_content, count_desc):
|
if options.hungry or count_words(out) > max(count_content, count_desc):
|
||||||
item.push_content(out)
|
item.push_content(out)
|
||||||
|
|
Loading…
Reference in New Issue