readabilite: add a `debug` option to get_article (also passed through by the `:get` handler as options.debug)
parent
63a06524b7
commit
a32f5a8536
|
@ -661,7 +661,7 @@ def cgi_get(environ, start_response):
|
||||||
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
|
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
|
||||||
|
|
||||||
elif options.get == 'article':
|
elif options.get == 'article':
|
||||||
output = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
|
output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise MorssException('no :get option passed')
|
raise MorssException('no :get option passed')
|
||||||
|
|
|
@ -307,7 +307,7 @@ def get_best_node(ranked_grades):
|
||||||
return lowest
|
return lowest
|
||||||
|
|
||||||
|
|
||||||
def get_article(data, url=None, encoding=None):
|
def get_article(data, url=None, encoding=None, debug=False):
|
||||||
" Input a raw html string, returns a raw html string of the article "
|
" Input a raw html string, returns a raw html string of the article "
|
||||||
|
|
||||||
html = parse(data, encoding)
|
html = parse(data, encoding)
|
||||||
|
@ -319,16 +319,17 @@ def get_article(data, url=None, encoding=None):
|
||||||
|
|
||||||
best = get_best_node(scores)
|
best = get_best_node(scores)
|
||||||
|
|
||||||
|
if not debug:
|
||||||
keep_threshold = percentile([x[1] for x in scores], 0.1)
|
keep_threshold = percentile([x[1] for x in scores], 0.1)
|
||||||
clean_root(best, keep_threshold)
|
clean_root(best, keep_threshold)
|
||||||
|
|
||||||
wc = count_words(best.text_content())
|
wc = count_words(best.text_content())
|
||||||
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
|
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
|
||||||
|
|
||||||
if wc - wca < 50 or float(wca) / wc > 0.3:
|
if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
best.make_links_absolute(url)
|
best.make_links_absolute(url)
|
||||||
|
|
||||||
return lxml.etree.tostring(best, pretty_print=True)
|
return lxml.etree.tostring(best if not debug else html, pretty_print=True)
|
||||||
|
|
Loading…
Reference in New Issue