Use internal readability fork

Much simpler: it doesn't clean the HTML and is probably less effective, but it is much faster.
2016-05-31 02:50:03 +02:00
parent 2b9bfb47e5
commit b14381f575
2 changed files with 98 additions and 9 deletions
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -79,16 +79,10 @@ def log(txt, force=False):
            print(repr(txt))


-try:
-    from readability.readability import Document
+from . import readabilite
+def readability(html, url=None):
+    """ Extract the main article from `html` with the internal readabilite module
+
+    `url` is not used. It is kept, with its former default of None, so that
+    calls written for the previous wrappers keep working. """
+
+    return readabilite.get_article(html)

-    def readability(html, url=None):
-        return Document(html, url=url).summary()
-except ImportError:
-    import breadability.readable
-
-    def readability(html, url=None):
-        return breadability.readable.Article(html, url=url).readable


 def len_html(txt):
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -0,0 +1,95 @@
+import lxml.etree
+import lxml.html
+import re
+
+
+def parse(data):
+    """ Turn HTML source into an lxml tree
+
+    Comments and blank text are dropped by the parser. """
+
+    html_parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+    tree = lxml.html.fromstring(data, parser=html_parser)
+
+    return tree
+
+
+def count_words(string):
+    """ Rough word estimate
+
+    Treats every word as being 5 letters long, whatever the language
+    (so it is way off for e.g. Chinese).
+    Whitespace between words is stepped over and never counted. """
+
+    total = 0
+    pos = 0
+    length = len(string)
+
+    while pos < length:
+        if string[pos] in '\n\t ':
+            # whitespace: step over it one character at a time
+            pos += 1
+        else:
+            # start of a word: count it, then jump 6 characters ahead
+            # (the assumed 5 letters, plus one more)
+            total += 1
+            pos += 6
+
+    return total
+
+
+# Case-insensitive keywords which count against a node when found in its class/id
+# (a few entries are listed twice, which is harmless in an alternation)
+regex_bad = re.compile(
+    '|'.join((
+        'combx', 'comment', 'community', 'disqus', 'extra', 'foot', 'header',
+        'menu', 'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor', 'ad-break',
+        'agegate', 'pagination', 'pager', 'popup', 'tweet', 'twitter', 'com-',
+        'contact', 'footnote', 'masthead', 'media', 'meta', 'outbrain', 'promo',
+        'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags',
+        'tool', 'widget',
+    )),
+    re.IGNORECASE)
+
+# Case-insensitive keywords which count in favour of a node when found in its class/id
+regex_good = re.compile(
+    '|'.join((
+        'and', 'article', 'body', 'column', 'main', 'shadow', 'content', 'entry',
+        'hentry', 'main', 'page', 'pagination', 'post', 'text', 'blog', 'story',
+        'par',
+    )),
+    re.IGNORECASE)
+
+
+def score_node(node):
+    """ Grade a single node on its own merits (its children are not looked at)
+
+    Non-element nodes and non-content tags get 0. Otherwise the score is built
+    from the tag name, from keyword hits in the class/id attributes, and from
+    the amount of text held directly by the node. """
+
+    score = 0
+
+    # comments, processing instructions and entities are not real elements:
+    # rule them out first, before reading attributes they may not provide
+    if isinstance(node, (lxml.html.HtmlComment, lxml.html.HtmlProcessingInstruction, lxml.html.HtmlEntity)):
+        return 0
+
+    if node.tag in ['script', 'head', 'iframe', 'object', 'noscript', 'param', 'embed', 'layer', 'applet', 'style']:
+        return 0
+
+    if node.tag in ['a']:
+        score -= 1
+
+    if node.tag in ['h1', 'h2', 'article']:
+        score += 8
+
+    # keep class and id apart, so that no keyword can match across the two
+    # (e.g. class="top-me" id="nu" must not be read as "menu")
+    class_id = node.get('class', '') + ' ' + node.get('id', '')
+
+    score += len(regex_good.findall(class_id)) * 4
+    score -= len(regex_bad.findall(class_id)) * 3
+
+    # only the text sitting directly in this node: its leading text, plus the
+    # tails of its children
+    own_text = ''.join([node.text or ''] + [x.tail or '' for x in node])
+    score += count_words(own_text) / 10.
+
+    return score
+
+
+def score_all(root):
+    """ Grade every node found under `root` (included)
+
+    Each node gets its own score, and passes half of it on to its parent and a
+    quarter of it on to its grandparent. Returns a dict mapping nodes to grades. """
+
+    grades = {}
+
+    for item in root.iter():
+        score = score_node(item)
+
+        grades[item] = score
+
+        ancestor = item.getparent()
+        for share in (2., 4.):
+            # stop at the top of the tree, but also at the edge of the graded
+            # subtree: `root` may itself have parents (e.g. when it was parsed
+            # from a fragment), and those are not in `grades`
+            if ancestor is None or ancestor not in grades:
+                break
+
+            grades[ancestor] += score / share
+            ancestor = ancestor.getparent()
+
+    return grades
+
+
+def get_best_node(root):
+    """ Return the node with the highest grade among `root` and its descendants """
+
+    grades = score_all(root)
+
+    # only the top node is needed, so there is no point in sorting them all
+    return max(grades, key=grades.get)
+
+
+def get_article(data):
+    """ Return the serialised markup of the best-graded node of the HTML in `data` """
+
+    best = get_best_node(parse(data))
+
+    # serialise as HTML rather than XML, so that empty elements are not collapsed
+    # into self-closing tags (e.g. <div/>) which HTML parsers read as left open.
+    # The tail is left out: it is text which follows the node, not part of it
+    return lxml.etree.tostring(best, method='html', with_tail=False)