diff --git a/morss/morss.py b/morss/morss.py index 0a6ccd8..f488674 100644 --- a/morss/morss.py +++ b/morss/morss.py @@ -79,16 +79,10 @@ def log(txt, force=False): print(repr(txt)) -try: - from readability.readability import Document +from . import readabilite +def readability(html, url): + return readabilite.get_article(html) - def readability(html, url=None): - return Document(html, url=url).summary() -except ImportError: - import breadability.readable - - def readability(html, url=None): - return breadability.readable.Article(html, url=url).readable def len_html(txt): diff --git a/morss/readabilite.py b/morss/readabilite.py new file mode 100644 index 0000000..6f1e7fb --- /dev/null +++ b/morss/readabilite.py @@ -0,0 +1,95 @@ +import lxml.etree +import lxml.html +import re + + +def parse(data): + parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True) + return lxml.html.fromstring(data, parser=parser) + + +def count_words(string): + """ Quick word count + + Simply assumes that all words are 5 letter long. + And so in about every language (sorry chinese). + Basically skips spaces in the count. """ + + i = 0 + count = 0 + + try: + while True: + if string[i] not in '\n\t ': + count += 1 + i += 6 + else: + i += 1 + except IndexError: + pass + + return count + + +regex_bad = re.compile('|'.join(['combx', 'comment', 'community', 'disqus', + 'extra', 'foot', 'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar', + 'sponsor', 'ad-break', 'agegate', 'pagination', 'pager', 'popup', 'tweet', + 'twitter', 'com-', 'contact', 'footnote', 'masthead', 'media', 'meta', + 'outbrain', 'promo', 'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', + 'shopping', 'tags', 'tool', 'widget']), re.I) + +regex_good = re.compile('|'.join(['and', 'article', 'body', 'column', + 'main', 'shadow', 'content', 'entry', 'hentry', 'main', 'page', + 'pagination', 'post', 'text', 'blog', 'story', 'par']), re.I) + + +def score_node(node): + score = 0 + + if node.tag in ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style']: + return 0 + + if isinstance(node, lxml.html.HtmlComment): + return 0 + + if node.tag in ['a']: + score -= 1 + + if node.tag in ['h1', 'h2', 'article']: + score += 8 + + class_id = node.get('class', '') + node.get('id', '') + + score += len(regex_good.findall(class_id) * 4) + score -= len(regex_bad.findall(class_id) * 3) + + score += count_words(''.join([node.text or ''] + [x.tail or '' for x in node])) / 10. + + return score + + +def score_all(root): + grades = {} + + for item in root.iter(): + score = score_node(item) + + grades[item] = score + + parent = item.getparent() + if parent is not None: + grades[parent] += score / 2. + + gdparent = parent.getparent() + if gdparent is not None: + grades[gdparent] += score / 4. + + return grades + + +def get_best_node(root): + return sorted(score_all(root).items(), key=lambda x: x[1], reverse=True)[0][0] + + +def get_article(data): + return lxml.etree.tostring(get_best_node(parse(data)))