Detect encoding every time
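
The change threads one explicit `encoding` value, detected at fetch time, through every parsing layer instead of re-detecting (or letting BeautifulSoup guess) at each step: `feeds.parse()` and the `ParserBase` constructor accept it, `ParserHTML.parse()` forwards it, and `readabilite.parse()` performs the single decode. A rough sketch of the resulting calling chain; `fetch_bytes` is a hypothetical stand-in, not a function from this diff:

    # Hedged sketch of the flow after this commit; only the keyword
    # threading is the point here.
    def fetch_bytes(url):
        raw = b'<html><body>...</body></html>'  # bytes straight off the wire
        encoding = 'utf-8'                      # detected once, at fetch time
        return raw, encoding

    raw, encoding = fetch_bytes('http://example.com/feed')

    # The same value then rides down the stack unchanged:
    #   feeds.parse(raw, url, mimetype, encoding=encoding)
    #     -> ParserHTML(raw, ruleset, encoding=encoding)   # stored as self.encoding
    #       -> readabilite.parse(raw, encoding=self.encoding)
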
@@ -15,7 +15,7 @@ import dateutil.parser
 from copy import deepcopy
 
 import lxml.html
-from bs4 import BeautifulSoup
+from .readabilite import parse as html_parse
 
 json.encoder.c_make_encoder = None
 
@@ -53,7 +53,7 @@ def parse_rules(filename=None):
     return rules
 
 
-def parse(data, url=None, mimetype=None):
+def parse(data, url=None, mimetype=None, encoding=None):
     " Determine which ruleset to use "
 
     rulesets = parse_rules()
@@ -67,7 +67,7 @@ def parse(data, url=None, mimetype=None):
                 for path in ruleset['path']:
                     if fnmatch(url, path):
                         parser = [x for x in parsers if x.mode == ruleset['mode']][0]
-                        return parser(data, ruleset)
+                        return parser(data, ruleset, encoding=encoding)
 
     # 2) Look for a parser based on mimetype
 
@@ -86,7 +86,7 @@ def parse(data, url=None, mimetype=None):
             # 'path' as they should have been caught beforehands
 
         try:
-            feed = parser(data)
+            feed = parser(data, encoding=encoding)
 
         except (ValueError):
             # parsing did not work
@@ -113,7 +113,7 @@ def parse(data, url=None, mimetype=None):
 
 
 class ParserBase(object):
-    def __init__(self, data=None, rules=None, parent=None):
+    def __init__(self, data=None, rules=None, parent=None, encoding=None):
         if rules is None:
             rules = parse_rules()[self.default_ruleset]
 
@@ -122,9 +122,10 @@ class ParserBase(object):
         if data is None:
             data = rules['base']
 
-        self.root = self.parse(data)
         self.parent = parent
+        self.encoding = encoding
+
+        self.root = self.parse(data)
 
     def parse(self, raw):
         pass
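
The hunk above also moves `self.root = self.parse(data)` below the attribute assignments: `parse()` runs inside the constructor, so `self.encoding` must exist before it is called. A minimal self-contained sketch of that ordering constraint (`html_parse` here is a stub standing in for the imported `.readabilite` helper):

    def html_parse(raw, encoding=None):    # stub for .readabilite.parse
        return ('parsed', raw, encoding)

    class ParserBase(object):
        def __init__(self, data=None, rules=None, parent=None, encoding=None):
            self.parent = parent
            self.encoding = encoding       # must be assigned first...
            self.root = self.parse(data)   # ...because parse() reads it

        def parse(self, raw):
            pass

    class ParserHTML(ParserBase):
        def parse(self, raw):
            return html_parse(raw, encoding=self.encoding)

    p = ParserHTML(data='<html/>', encoding='utf-8')
    print(p.root)   # ('parsed', '<html/>', 'utf-8')
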
@@ -442,8 +443,7 @@ class ParserHTML(ParserXML):
     mimetype = ['text/html', 'application/xhtml+xml']
 
     def parse(self, raw):
-        parser = etree.HTMLParser(remove_blank_text=True) # remove_blank_text needed for pretty_print
-        return etree.fromstring(BeautifulSoup(raw, 'lxml').prettify('utf-8'), parser)
+        return html_parse(raw, encoding=self.encoding)
 
     def tostring(self, encoding='unicode', **k):
         return lxml.html.tostring(self.root, encoding=encoding, **k)

@@ -10,7 +10,6 @@ import re
 
 import lxml.etree
 import lxml.html
-from bs4 import BeautifulSoup
 
 from . import feeds
 from . import crawler
@@ -261,7 +260,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         log('non-text page')
         return True
 
-    out = readabilite.get_article(data, link, options.encoding or crawler.detect_encoding(data, con))
+    out = readabilite.get_article(data, url=con.geturl(), encoding=encoding)
 
     if out is not None:
         item.content = out
@@ -329,7 +328,7 @@ def FeedFetch(url, options):
 
     if options.items:
         # using custom rules
-        rss = feeds.FeedHTML(xml)
+        rss = feeds.FeedHTML(xml, encoding=encoding)
 
         rss.rules['title'] = options.title              if options.title        else '//head/title'
         rss.rules['desc'] = options.desc                if options.desc         else '//head/meta[@name="description"]/@content'
@@ -349,7 +348,7 @@ def FeedFetch(url, options):
 
     else:
        try:
-            rss = feeds.parse(xml, url, contenttype)
+            rss = feeds.parse(xml, url, contenttype, encoding=encoding)
             rss = rss.convert(feeds.FeedXML)
                 # contains all fields, otherwise much-needed data can be lost
 
@@ -649,7 +648,7 @@ def cgi_page(environ, start_response):
     data, con, contenttype, encoding = crawler.adv_get(url=url)
 
     if contenttype in ['text/html', 'application/xhtml+xml', 'application/xml']:
-        html = lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify())
+        html = readabilite.parse(data, encoding=encoding)
         html.make_links_absolute(con.geturl())
 
         kill_tags = ['script', 'iframe', 'noscript']

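
Across the main module the pattern is uniform: `crawler.adv_get()` already returns the detected encoding alongside the payload (see the `cgi_page` hunk above), and each call site now forwards that value instead of guessing again, as `ItemFill` previously did with `options.encoding or crawler.detect_encoding(...)`. A runnable sketch of the convention, with mock stand-ins for the real modules:

    from collections import namedtuple

    # Mocks so the sketch runs; the real adv_get lives in the crawler module.
    Conn = namedtuple('Conn', ['geturl'])

    def adv_get(url):   # mirrors the 4-tuple unpacked in the cgi_page hunk
        data = b'<html><body>hello</body></html>'
        return data, Conn(geturl=lambda: url), 'text/html', 'utf-8'

    data, con, contenttype, encoding = adv_get('http://example.com/page')

    # downstream, the detected value is passed explicitly, never re-detected:
    #   feeds.parse(data, url, contenttype, encoding=encoding)
    #   readabilite.get_article(data, url=con.geturl(), encoding=encoding)
    print(contenttype, encoding, con.geturl())
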
@@ -6,11 +6,14 @@ import re
 
 def parse(data, encoding=None):
     if encoding:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
-    else:
-        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+        data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
 
-    return lxml.html.fromstring(BeautifulSoup(data, 'lxml').prettify('utf-8'), parser=parser)
+    else:
+        data = BeautifulSoup(data, 'lxml').prettify('utf-8')
+
+    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
+
+    return lxml.html.fromstring(data, parser=parser)
 
 
 def count_words(string):
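
The rewritten helper decodes exactly once: BeautifulSoup is handed the caller's encoding via `from_encoding` (or left to sniff one), the soup is re-serialised as UTF-8 bytes, and lxml gets a parser pinned to UTF-8 so it never has to guess. A self-contained usage sketch; the function body follows the hunk above, and the latin-1 byte string is illustrative:

    from bs4 import BeautifulSoup
    import lxml.html

    def parse(data, encoding=None):
        # decode once, honouring the caller-supplied charset if any...
        if encoding:
            data = BeautifulSoup(data, 'lxml', from_encoding=encoding).prettify('utf-8')
        else:
            data = BeautifulSoup(data, 'lxml').prettify('utf-8')

        # ...then hand lxml UTF-8 bytes and a parser pinned to UTF-8
        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='utf-8')
        return lxml.html.fromstring(data, parser=parser)

    # latin-1 bytes: 0xE9 is 'é', and would be decoded wrongly without the hint
    doc = parse(b'<html><body><p>caf\xe9</p></body></html>', encoding='iso-8859-1')
    print(doc.text_content().strip())   # -> café
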