From f5630408094ebd9e207f9c631e6a0f4c892f95c9 Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 28 Oct 2017 01:30:21 +0200 Subject: [PATCH] readabilite: threshold to detect if it contains an article Useful for video/image-based pages --- morss/morss.py | 3 ++- morss/readabilite.py | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/morss/morss.py b/morss/morss.py index a37afb8..4f6cbe5 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -245,7 +245,8 @@ def ItemFill(item, options, feedurl='/', fast=False): out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con)) - item.content = out + if out is not None: + item.content = out return True diff --git a/morss/readabilite.py b/morss/readabilite.py index a451a3d..30ce0b3 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -258,6 +258,12 @@ def get_article(data, url=None, encoding=None): scores = score_all(html) best = get_best_node(scores) + wc = count_words(best.text_content()) + wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')])) + + if wc - wca < 50 or float(wca) / wc > 0.3: + return None + if url: best.make_links_absolute(url)