readabilite: add a threshold to detect whether the page contains an article
Useful for video/image-based pages
master
parent
0df6409b0e
commit
f563040809
|
@@ -245,6 +245,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
|
|||
|
||||
out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
|
||||
|
||||
if out is not None:
|
||||
item.content = out
|
||||
|
||||
return True
|
||||
|
|
|
@@ -258,6 +258,12 @@ def get_article(data, url=None, encoding=None):
|
|||
scores = score_all(html)
|
||||
best = get_best_node(scores)
|
||||
|
||||
wc = count_words(best.text_content())
|
||||
wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
|
||||
|
||||
if wc - wca < 50 or float(wca) / wc > 0.3:
|
||||
return None
|
||||
|
||||
if url:
|
||||
best.make_links_absolute(url)
|
||||
|
||||
|
|
Loading…
Reference in New Issue