readabilite: add a threshold to detect whether the page contains an article
Useful for video/image-based pages (branch: master)
parent
0df6409b0e
commit
f563040809
|
@@ -245,6 +245,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
||||||
|
|
||||||
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
|
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
|
||||||
|
|
||||||
|
if out is not None:
|
||||||
item.content = out
|
item.content = out
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
|
@@ -258,6 +258,12 @@ def get_article(data, url=None, encoding=None):
|
||||||
scores = score_all(html)
|
scores = score_all(html)
|
||||||
best = get_best_node(scores)
|
best = get_best_node(scores)
|
||||||
|
|
||||||
|
wc = count_words(best.text_content())
|
||||||
|
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
|
||||||
|
|
||||||
|
if wc - wca < 50 or float(wca) / wc > 0.3:
|
||||||
|
return None
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
best.make_links_absolute(url)
|
best.make_links_absolute(url)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue