import lxml.etree
import lxml.html
import re


def parse(data, encoding=None):
    if encoding:
        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
    else:
        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)

    return lxml.html.fromstring(data, parser=parser)


def count_words(string):
    """ Quick word count

    Simply assumes that all words are 5 letters long,
    which is roughly true in most languages (sorry, Chinese).
    Whitespace is skipped rather than counted; every hit
    advances 6 chars (5 letters plus 1 separator). """

    i = 0
    count = 0

    try:
        while True:
            if string[i] not in '\n\t ':
                count += 1
                i += 6
            else:
                i += 1

    except IndexError:
        pass

    return count
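
# A quick illustration of the estimate above (example string and numbers are
# mine, not from the original code): the 24-character sentence below contains
# 5 real words, and the skip-6 heuristic counts 4, which is close enough for
# the coarse thresholds used in score_node() below.
#
#     >>> count_words('hello world, how are you')
#     4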
regex_bad = re.compile('|'.join(['comment', 'community', 'extra', 'foot',
    'sponsor', 'pagination', 'pager', 'tweet', 'twitter', 'com-', 'masthead',
    'media', 'meta', 'related', 'shopping', 'tags', 'tool', 'author',
    'about']), re.I)

regex_junk = re.compile('|'.join(['robots-nocontent', 'combx', 'disqus',
    'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar', 'ad-',
    'agegate', 'popup', 'sharing', 'share', 'social', 'contact', 'footnote',
    'outbrain', 'promo', 'scroll', 'hidden', 'widget', 'hide']), re.I)

regex_good = re.compile('|'.join(['and', 'article', 'body', 'column', 'main',
    'shadow', 'content', 'entry', 'hentry', 'main', 'page', 'pagination',
    'post', 'text', 'blog', 'story', 'par', 'editorial']), re.I)

tags_bad = ['a']

tags_junk = ['script', 'head', 'iframe', 'object', 'noscript', 'param',
    'embed', 'layer', 'applet', 'style', 'form', 'input', 'textarea',
    'button', 'footer']

tags_good = ['h1', 'h2', 'h3', 'article', 'p', 'cite', 'section', 'img',
    'figcaption', 'figure']

attributes_fine = ['title', 'src', 'href', 'type', 'name', 'for', 'value']


def score_node(node):
    score = 0

    if isinstance(node, lxml.html.HtmlComment):
        return 0

    class_id = node.get('class', '') + node.get('id', '')

    score -= len(regex_bad.findall(class_id))
    score -= len(regex_junk.findall(class_id))
    score += len(regex_good.findall(class_id))

    # the .tail part is there to include *everything* in that node
    wc = count_words(''.join([node.text or ''] + [x.tail or '' for x in node]))

    if wc > 10:
        score += 1
    if wc > 20:
        score += 1
    if wc > 30:
        score += 1

    if node.tag in tags_bad or node.tag in tags_junk:
        score = -1 * abs(score)

    if node.tag in tags_good:
        score += 3

    return score


def score_all(root):
    grades = {}

    for item in root.iter():
        score = score_node(item)
        grades[item] = score

        # propagate a decaying share of the score to the ancestors;
        # root.iter() is document order, so ancestors are already graded
        factor = 2
        for ancestor in item.iterancestors():
            if score / factor > 1:
                grades[ancestor] += score / factor
                factor *= 2
            else:
                break

    return grades
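
# Worked example of the decay above (numbers are mine, for illustration):
# under Python 3 division, a node scoring 12 adds 12/2 = 6.0 to its parent,
# 12/4 = 3.0 to its grandparent and 12/8 = 1.5 to its great-grandparent,
# then stops because 12/16 = 0.75 is not > 1. (Under Python 2's integer
# division only the 6 and the 3 would propagate.)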
def write_score_all(root, grades):
    for node in root.iter():
        node.attrib['score'] = str(int(grades[node]))


def clean_html(root):
    # list() is needed to be able to remove elements while iterating
    for item in list(root.iter()):

        # Step 1. Do we keep the node?

        if item.tag in tags_junk:
            # remove junk tags (scripts, forms, embeds, etc.)
            item.getparent().remove(item)
            continue

        if item.tag in ['div'] \
                and len(list(item.iterchildren())) <= 1 \
                and not (item.text or '').strip() \
                and not (item.tail or '').strip():
            # unwrap a div with at most one item inside
            item.drop_tag()
            continue

        class_id = item.get('class', '') + item.get('id', '')
        if regex_bad.match(class_id) is not None:
            # remove nodes with a junk class/id
            item.getparent().remove(item)
            continue

        if isinstance(item, lxml.html.HtmlComment):
            # remove comments
            item.getparent().remove(item)
            continue

        # Step 2. Clean the node's attributes
        # (list() again, since we delete keys while iterating)

        for attrib in list(item.attrib):
            if attrib not in attributes_fine:
                del item.attrib[attrib]


def br2p(root):
    for node in list(root.iterfind('.//br')):
        parent = node.getparent()
        if parent is None:
            continue

        gdparent = parent.getparent()
        if gdparent is None:
            continue

        if node.tail is None:
            # the <br/> is at the end of a div; skip it to avoid
            # creating an empty <p></p>
            continue

        else:
            # set up a new node holding everything after the <br/>
            new_node = lxml.html.Element(parent.tag)
            new_node.text = node.tail

            for child in node.itersiblings():
                new_node.append(child)

            # delete the <br/>
            node.tail = None
            parent.remove(node)

            gdparent.insert(gdparent.index(parent) + 1, new_node)


def lowest_common_ancestor(nodeA, nodeB, max_depth=None):
    ancestorsA = list(nodeA.iterancestors())
    ancestorsB = list(nodeB.iterancestors())

    if max_depth is not None:
        ancestorsA = ancestorsA[:max_depth]
        ancestorsB = ancestorsB[:max_depth]

    ancestorsA.insert(0, nodeA)
    ancestorsB.insert(0, nodeB)

    for ancestorA in ancestorsA:
        if ancestorA in ancestorsB:
            return ancestorA

    # should always find one (at least <html/>), but max_depth can cut it off
    return nodeA


def rank_nodes(grades):
    return sorted(grades.items(), key=lambda x: x[1], reverse=True)


def get_best_node(grades, highlight=False):
    # assumes the document yields at least two scored nodes
    top = rank_nodes(grades)
    lowest = lowest_common_ancestor(top[0][0], top[1][0], 3)

    if highlight:
        top[0][0].attrib['style'] = 'border: 2px solid blue'
        top[1][0].attrib['style'] = 'border: 2px solid green'
        lowest.attrib['style'] = 'outline: 2px solid red'

    return lowest


def get_article(data, url=None, encoding=None):
    html = parse(data, encoding)
    clean_html(html)
    br2p(html)

    scores = score_all(html)
    best = get_best_node(scores)

    if url:
        best.make_links_absolute(url)

    return lxml.etree.tostring(best)
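

# Minimal usage sketch, added for illustration (the command-line handling
# below is an assumption, not part of the original module): pass the path of
# a saved HTML page and the extracted article markup is printed.
if __name__ == '__main__':
    import sys

    with open(sys.argv[1], 'rb') as f:
        data = f.read()

    # get_article() returns bytes (lxml.etree.tostring's default)
    print(get_article(data).decode('utf-8'))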