readabilite: add a threshold to detect whether the page contains an article
Useful for video- or image-based pages, which contain little article text
This commit is contained in:
		@@ -245,7 +245,8 @@ def ItemFill(item, options, feedurl='/', fast=False):
 | 
			
		||||
 | 
			
		||||
    out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
 | 
			
		||||
 | 
			
		||||
    item.content = out
 | 
			
		||||
    if out is not None:
 | 
			
		||||
        item.content = out
 | 
			
		||||
 | 
			
		||||
    return True
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -258,6 +258,12 @@ def get_article(data, url=None, encoding=None):
 | 
			
		||||
    scores = score_all(html)
 | 
			
		||||
    best = get_best_node(scores)
 | 
			
		||||
 | 
			
		||||
    wc = count_words(best.text_content())
 | 
			
		||||
    wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')]))
 | 
			
		||||
 | 
			
		||||
    if wc - wca < 50 or float(wca) / wc > 0.3:
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
    if url:
 | 
			
		||||
        best.make_links_absolute(url)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user