import lxml.etree
import lxml.html
import re


def parse(data):
    """ Build an lxml.html tree out of raw HTML

    The parser is created with remove_blank_text and remove_comments
    switched on, and the result of lxml.html.fromstring is handed back
    unchanged. """

    html_parser = lxml.html.HTMLParser(remove_comments=True, remove_blank_text=True)
    tree = lxml.html.fromstring(data, parser=html_parser)

    return tree


def count_words(string):
    """ Quick word count

    Simply assumes that all words are 5 letter long.
    And so in about every language (sorry chinese).
    Basically skips spaces in the count. """

    i = 0
    count = 0

    try:
        while True:
            if string[i] not in '\n\t ':
                count += 1
                i += 6
            else:
                i += 1
    except IndexError:
        pass

    return count


# Keywords that count against a node when found in its class/id (used by
# score_node and clean_html). Alternatives are tried left to right, so the
# order of the entries is significant.
regex_bad = re.compile(
    '|'.join((
        'robots-nocontent', 'combx', 'comment', 'community', 'disqus',
        'extra', 'foot', 'header', 'menu', 'remark', 'rss', 'shoutbox',
        'sidebar', 'sponsor', 'ad-break', 'agegate', 'pagination', 'pager',
        'popup', 'tweet', 'twitter', 'com-', 'sharing', 'share', 'social',
        'contact', 'footnote', 'masthead', 'media', 'meta', 'outbrain',
        'promo', 'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor',
        'shopping', 'tags', 'tool', 'widget',
    )),
    re.IGNORECASE)

# Keywords that count in favour of a node when found in its class/id (used by
# score_node).
regex_good = re.compile(
    '|'.join((
        'and', 'article', 'body', 'column', 'main', 'shadow', 'content',
        'entry', 'hentry', 'main', 'page', 'pagination', 'post', 'text',
        'blog', 'story', 'par',
    )),
    re.IGNORECASE)

# Tags that score_node grades 0 and that clean_html removes.
tags_junk = [
    'script', 'head', 'iframe', 'object', 'noscript',
    'param', 'embed', 'layer', 'applet', 'style',
]

# The only attributes clean_html leaves on a node.
attributes_fine = [
    'title', 'src', 'href', 'type', 'name', 'for', 'value',
]


def score_node(node):
    """ Grade a single node on how likely it is to hold article content

    Comments and tags listed in tags_junk are graded 0. Otherwise the grade
    is built from:
      - the tag itself: -1 for a link, +8 for h1/h2/article
      - the class and id: +4 per regex_good hit, -3 per regex_bad hit
      - the text sitting directly in the node: count_words(...) / 10

    Children are not graded here, only the text between them. """

    if isinstance(node, lxml.html.HtmlComment) or node.tag in tags_junk:
        return 0

    score = 0

    if node.tag == 'a':
        score -= 1

    if node.tag in ('h1', 'h2', 'article'):
        score += 8

    # Joined with a space so that a keyword cannot match across the boundary
    # between the two attributes (class="side" + id="bar" is no "sidebar").
    # Missing/empty attributes are simply left out.
    class_id = ' '.join(part for part in (node.get('class'), node.get('id')) if part)

    score += 4 * len(regex_good.findall(class_id))
    score -= 3 * len(regex_bad.findall(class_id))

    # node.text only covers the text before the first child; the children's
    # .tail holds the text in between and after them
    own_text = ''.join([node.text or ''] + [child.tail or '' for child in node])
    score += count_words(own_text) / 10.

    return score


def score_all(root):
    """ Grade every node found under `root` (itself included)

    Each node gets its own score_node() grade. On top of that, half of that
    grade is added to its parent and a quarter to its grandparent, as long
    as they are part of the tree being graded.

    Returns a dict mapping each node to its grade. """

    grades = {}

    for item in root.iter():
        score = score_node(item)

        grades[item] = score

        # iter() walks in document order, so any ancestor located within
        # `root` is already in `grades` by now. Ancestors of `root` itself
        # are not (lxml.html.fromstring may return an element that still has
        # a parent), and must be left alone rather than raise a KeyError.
        parent = item.getparent()
        if parent is None or parent not in grades:
            continue

        grades[parent] += score / 2.

        gdparent = parent.getparent()
        if gdparent is not None and gdparent in grades:
            grades[gdparent] += score / 4.

    return grades


def get_best_node(root):
    """ Return the node with the highest grade under `root`

    Grades come from score_all(). When several nodes share the top grade,
    the one that comes first in the grades mapping is returned. """

    grades = score_all(root)

    # A single pass with max() is enough, no need to sort every node. Like
    # the stable reverse sort it replaces, max() keeps the first of equals.
    return max(grades, key=grades.get)


def _drop_node(node):
    """ Detach `node` from its parent while keeping the text that follows it

    lxml stores the text located right after an element in that element's
    .tail, so a bare parent.remove(node) would throw it away as well. The tail
    is therefore handed over to the previous sibling, or to the parent's
    .text when there is none. The node must have a parent. """

    parent = node.getparent()

    if node.tail:
        previous = node.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or '') + node.tail
        else:
            parent.text = (parent.text or '') + node.tail

    parent.remove(node)


def clean_html(root):
    """ Strip unwanted nodes and attributes from the tree, in place

    A node is dropped (along with its children) when it is a comment, when
    its tag is listed in tags_junk, or when its class/id starts with one of
    the regex_bad keywords. Nodes that are kept lose every attribute not
    listed in attributes_fine. A node without a parent (typically `root`)
    cannot be dropped and only gets its attributes cleaned.

    Nothing is returned. """

    # Work on a snapshot: detaching nodes while root.iter() is still running
    # is not safe. Children of an already dropped node are still visited,
    # which is harmless since they are no longer part of the tree.
    for item in list(root.iter()):
        # Step 1. Do we keep the node?

        if isinstance(item, lxml.html.HtmlComment):
            # comments are tested first, they carry no class/id to look at
            unwanted = True
        elif item.tag in tags_junk:
            unwanted = True
        else:
            # Missing/empty attributes are left out, and a space keeps a
            # keyword from matching across the class/id boundary.
            # NOTE(review): match() only looks at the start of the string,
            # whereas score_node uses findall() -- confirm this is intended.
            class_id = ' '.join(part for part in (item.get('class'), item.get('id')) if part)
            unwanted = regex_bad.match(class_id) is not None

        if unwanted and item.getparent() is not None:
            _drop_node(item)
            continue

        # Step 2. Clean the node's attributes

        # list() so that the mapping is not modified while being iterated
        for attrib in list(item.attrib):
            if attrib not in attributes_fine:
                del item.attrib[attrib]


def get_article(data):
    """ Extract the main article out of an HTML document

    `data` is parsed with parse(), the best graded node is picked with
    get_best_node(), and that node is returned serialised by
    lxml.etree.tostring. """

    best = get_best_node(parse(data))

    # with_tail=False: the tail is the text located *after* the node, which
    # belongs to the parent and is not part of the article
    return lxml.etree.tostring(best, with_tail=False)