readabilite: add debug option (also used by :get)

master
pictuga 2020-04-09 19:08:13 +02:00
parent 63a06524b7
commit a32f5a8536
2 changed files with 7 additions and 6 deletions

View File

@@ -661,7 +661,7 @@ def cgi_get(environ, start_response):
output = lxml.etree.tostring(html.getroottree(), encoding='utf-8') output = lxml.etree.tostring(html.getroottree(), encoding='utf-8')
elif options.get == 'article': elif options.get == 'article':
output = readabilite.get_article(data, url=con.geturl(), encoding=encoding) output = readabilite.get_article(data, url=con.geturl(), encoding=encoding, debug=options.debug)
else: else:
raise MorssException('no :get option passed') raise MorssException('no :get option passed')

View File

@@ -307,7 +307,7 @@ def get_best_node(ranked_grades):
return lowest return lowest
def get_article(data, url=None, encoding=None): def get_article(data, url=None, encoding=None, debug=False):
" Input a raw html string, returns a raw html string of the article " " Input a raw html string, returns a raw html string of the article "
html = parse(data, encoding) html = parse(data, encoding)
@@ -319,16 +319,17 @@ def get_article(data, url=None, encoding=None):
best = get_best_node(scores) best = get_best_node(scores)
keep_threshold = percentile([x[1] for x in scores], 0.1) if not debug:
clean_root(best, keep_threshold) keep_threshold = percentile([x[1] for x in scores], 0.1)
clean_root(best, keep_threshold)
wc = count_words(best.text_content()) wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')])) wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
if wc - wca < 50 or float(wca) / wc > 0.3: if not debug and (wc - wca < 50 or float(wca) / wc > 0.3):
return None return None
if url: if url:
best.make_links_absolute(url) best.make_links_absolute(url)
return lxml.etree.tostring(best, pretty_print=True) return lxml.etree.tostring(best if not debug else html, pretty_print=True)