Use internal readability fork

Much simpler: it doesn't clean the HTML and is probably less effective, but it is much faster
master
pictuga 2016-05-31 02:50:03 +02:00
parent 2b9bfb47e5
commit b14381f575
2 changed files with 98 additions and 9 deletions

View File

@ -79,16 +79,10 @@ def log(txt, force=False):
print(repr(txt))
try:
from readability.readability import Document
from . import readabilite
def readability(html, url):
return readabilite.get_article(html)
def readability(html, url=None):
return Document(html, url=url).summary()
except ImportError:
import breadability.readable
def readability(html, url=None):
return breadability.readable.Article(html, url=url).readable
def len_html(txt):

View File

@ -0,0 +1,95 @@
import lxml.etree
import lxml.html
import re
def parse(data):
    """Turn an HTML string into an lxml element tree.

    The parser is configured with ``remove_blank_text`` and
    ``remove_comments``, so the resulting tree carries neither ignorable
    whitespace nodes nor HTML comments.

    Returns whatever ``lxml.html.fromstring`` yields for ``data``.
    """
    html_parser = lxml.html.HTMLParser(
        remove_comments=True,
        remove_blank_text=True,
    )
    tree = lxml.html.fromstring(data, parser=html_parser)
    return tree
def count_words(string):
    """Cheaply estimate how many words ``string`` contains.

    No real tokenising happens: every time the scan lands on a character
    that is not a newline, tab or space it counts one word and jumps six
    positions ahead (a nominal five-letter word plus its separator).
    Whitespace characters are stepped over one at a time. The result is
    therefore only an approximation, and a poor one for scripts that do
    not separate words with spaces.
    """
    total = 0
    pos = 0
    length = len(string)
    while pos < length:
        if string[pos] in '\n\t ':
            # Whitespace never starts a word; move on to the next char.
            pos += 1
        else:
            total += 1
            pos += 6
    return total
# Case-insensitive alternation of substrings that score_node counts against
# a node when they occur in its class/id text. Order matters for
# alternation (an earlier, shorter entry such as 'foot' wins over
# 'footnote'); the repeated entries are redundant but harmless.
regex_bad = re.compile(
    '|'.join((
        'combx', 'comment', 'community', 'disqus',
        'extra', 'foot', 'header', 'menu', 'remark', 'rss', 'shoutbox',
        'sidebar', 'sponsor', 'ad-break', 'agegate', 'pagination', 'pager',
        'popup', 'tweet', 'twitter', 'com-', 'contact', 'footnote',
        'masthead', 'media', 'meta', 'outbrain', 'promo', 'related',
        'scroll', 'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags',
        'tool', 'widget',
    )),
    re.IGNORECASE,
)

# Case-insensitive alternation of substrings that score_node counts in
# favour of a node when they occur in its class/id text.
regex_good = re.compile(
    '|'.join((
        'and', 'article', 'body', 'column',
        'main', 'shadow', 'content', 'entry', 'hentry', 'main', 'page',
        'pagination', 'post', 'text', 'blog', 'story', 'par',
    )),
    re.IGNORECASE,
)
def score_node(node):
    """Return a heuristic score for how article-like a single node is.

    The score only looks at the node itself (ancestors are handled by
    ``score_all``):

    * comments and non-content tags (script, style, head, ...) score 0;
    * ``<a>`` loses 1 point, ``<h1>``/``<h2>``/``<article>`` gain 8;
    * each ``regex_good`` hit in the class/id text adds 4, each
      ``regex_bad`` hit removes 3;
    * the node's own text (its ``text`` plus the ``tail`` of each direct
      child) adds ``count_words(...) / 10``.

    Returns an int or float; higher means more likely to be content.
    """
    # Comments have no attributes or text worth grading.
    if isinstance(node, lxml.html.HtmlComment):
        return 0
    if node.tag in ('script', 'head', 'iframe', 'object', 'noscript',
                    'param', 'embed', 'layer', 'applet', 'style'):
        return 0

    score = 0
    if node.tag == 'a':
        score -= 1
    if node.tag in ('h1', 'h2', 'article'):
        score += 8

    # Keep class and id apart with a space: gluing them together lets a
    # keyword match straddle the boundary (class="foo" id="ter" would be
    # read as "footer"). None of the keywords contains a space.
    class_id = node.get('class', '') + ' ' + node.get('id', '')
    score += 4 * len(regex_good.findall(class_id))
    score -= 3 * len(regex_bad.findall(class_id))

    # Text that belongs directly to this node, not to its descendants.
    own_text = ''.join([node.text or ''] + [child.tail or '' for child in node])
    score += count_words(own_text) / 10.
    return score
def score_all(root):
    """Grade every node in the tree under ``root``.

    Each node starts with its own ``score_node`` value. Half of that value
    is then added to its parent's grade and a quarter to its grandparent's,
    so containers of well-scored content rise to the top.

    Only nodes yielded by ``root.iter()`` are graded. ``root`` may itself
    have ancestors (``lxml.html.fromstring`` can hand back an element that
    sits inside a generated ``<body>``); those outside ancestors are
    skipped instead of raising ``KeyError``.

    Returns a dict mapping each node to its grade.
    """
    grades = {}
    # iter() walks in document order, so every ancestor inside the subtree
    # has already been graded by the time one of its descendants is seen.
    for item in root.iter():
        score = score_node(item)
        grades[item] = score

        parent = item.getparent()
        if parent is None or parent not in grades:
            # Tree root, or an ancestor that lies outside the graded subtree.
            continue
        grades[parent] += score / 2.

        gdparent = parent.getparent()
        if gdparent is not None and gdparent in grades:
            grades[gdparent] += score / 4.
    return grades
def get_best_node(root):
    """Return the node under ``root`` that received the highest grade.

    Grades come from ``score_all``. On ties the node that appears first in
    the grades dict wins, which is the same node the previous stable
    reverse sort selected.
    """
    grades = score_all(root)
    # A single pass with max() replaces sorting every entry just to keep
    # the first one.
    best_node, _best_grade = max(grades.items(), key=lambda pair: pair[1])
    return best_node
def get_article(data):
    """Extract the most article-like element from an HTML document.

    ``data`` is parsed with ``parse``, the best-graded node is chosen with
    ``get_best_node``, and that node is serialised with
    ``lxml.etree.tostring``.

    Returns the serialised markup of the selected node only.
    """
    best = get_best_node(parse(data))
    # with_tail=False: by default tostring() also emits the text that
    # follows the element in its parent, which is not part of the node.
    return lxml.etree.tostring(best, with_tail=False)