readabilite: add a threshold to detect whether the page contains an article

Useful for video- or image-based pages
master
pictuga 2017-10-28 01:30:21 +02:00
parent 0df6409b0e
commit f563040809
2 changed files with 8 additions and 1 deletion

View File

@ -245,7 +245,8 @@ def ItemFill(item, options, feedurl='/', fast=False):
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
item.content = out
if out is not None:
item.content = out
return True

View File

@ -258,6 +258,12 @@ def get_article(data, url=None, encoding=None):
scores = score_all(html)
best = get_best_node(scores)
wc = count_words(best.text_content())
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
if wc - wca < 50 or float(wca) / wc > 0.3:
return None
if url:
best.make_links_absolute(url)