diff --git a/morss/morss.py b/morss/morss.py index a37afb8..4f6cbe5 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -245,7 +245,8 @@ def ItemFill(item, options, feedurl='/', fast=False): out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con)) - item.content = out + if out is not None: + item.content = out return True diff --git a/morss/readabilite.py b/morss/readabilite.py index a451a3d..30ce0b3 100644 --- a/morss/readabilite.py +++ b/morss/readabilite.py @@ -258,6 +258,12 @@ def get_article(data, url=None, encoding=None): scores = score_all(html) best = get_best_node(scores) + wc = count_words(best.text_content()) + wca = count_words(' '.join([x.text_content() for x in best.findall('.//a')])) + + if wc - wca < 50 or float(wca) / wc > 0.3: + return None + if url: best.make_links_absolute(url)