Added override for auto-detected character encoding of parsed pages.
parent 627163abff
commit 993ac638a3
@@ -72,11 +72,14 @@ def detect_encoding(data, con=None):
 class EncodingFixHandler(BaseHandler):
+    def __init__(self, encoding=None):
+        self.encoding = encoding
+
     def http_response(self, req, resp):
         maintype = resp.info().get('Content-Type', '').split('/')[0]
         if 200 <= resp.code < 300 and maintype == 'text':
             data = resp.read()
-            enc = detect_encoding(data, resp)
+            enc = detect_encoding(data, resp) if not self.encoding else self.encoding

             if enc:
                 data = data.decode(enc, 'replace')
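For context: a truthy `encoding` now short-circuits `detect_encoding()` at the HTTP layer. A minimal sketch of using the new constructor argument on its own, assuming the Python 2 / urllib2 handler protocol the crawler follows (the import path, URL and charset below are illustrative, not from this commit):

import urllib2

from morss import crawler  # assumed import path

# Force iso-8859-1 instead of auto-detection for text/* responses.
opener = urllib2.build_opener(crawler.EncodingFixHandler(encoding='iso-8859-1'))
resp = opener.open('http://example.com/legacy.html', timeout=10)
body = resp.read()  # per the hunk above: decoded with the forced charset, 'replace' on errors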
@@ -129,10 +129,11 @@ def parseOptions(options):
 default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
                     crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(),
-                    crawler.HTTPRefreshHandler(), crawler.EncodingFixHandler()]
+                    crawler.HTTPRefreshHandler()]

-def custom_handler(accept, delay=DELAY):
+def custom_handler(accept, delay=DELAY, encoding=None):
     handlers = default_handlers[:]
+    handlers.append(crawler.EncodingFixHandler(encoding))
     handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept]))
     handlers.append(crawler.SQliteCacheHandler(delay))
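Since `EncodingFixHandler` is no longer part of the shared `default_handlers` list, each call site now builds its own instance, so the override can differ from one fetch to the next; passing `None` keeps the previous auto-detection behaviour. An illustrative call, with made-up values (the real call sites appear in the hunks below):

# Hypothetical call site: the third argument threads the user's encoding
# override into the handler chain; None falls back to detect_encoding().
con = custom_handler('html', delay=0, encoding='utf-8').open('http://example.com/', timeout=4)
data = con.read()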
@@ -266,7 +267,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
     delay = -2

     try:
-        con = custom_handler('html', delay).open(link, timeout=TIMEOUT)
+        con = custom_handler('html', delay, options.encoding).open(link, timeout=TIMEOUT)
         data = con.read()

     except (IOError, HTTPException) as e:
@@ -278,7 +279,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         log('non-text page')
         return True

-    out = readabilite.get_article(data)
+    out = readabilite.get_article(data, options.encoding)

     if options.hungry or count_words(out) > max(count_content, count_desc):
         item.push_content(out)
@@ -367,7 +368,7 @@ def FeedFetch(url, options):
     delay = 0

     try:
-        con = custom_handler('xml', delay).open(url, timeout=TIMEOUT * 2)
+        con = custom_handler('xml', delay, options.encoding).open(url, timeout=TIMEOUT * 2)
         xml = con.read()

     except (HTTPError) as e:
@@ -3,8 +3,12 @@ import lxml.html
 import re


-def parse(data):
-    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+def parse(data, encoding=None):
+    if encoding:
+        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
+    else:
+        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+
     return lxml.html.fromstring(data, parser=parser)
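The parser-side half of the override leans on lxml's `HTMLParser(encoding=...)` argument: without it, libxml2 guesses the charset from declarations or byte patterns; with it, the raw bytes are decoded as instructed. A self-contained sketch of the behaviour this enables (the byte string is illustrative):

import lxml.html

raw = u'<p>caf\xe9</p>'.encode('iso-8859-1')  # 0xe9 is not valid UTF-8

# An explicit charset removes the guesswork.
parser = lxml.html.HTMLParser(encoding='iso-8859-1')
doc = lxml.html.fromstring(raw, parser=parser)
print(doc.text_content())  # u'café'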
@@ -149,5 +153,5 @@ def br2p(root):
             gdparent.insert(gdparent.index(parent)+1, new_item)


-def get_article(data):
-    return lxml.etree.tostring(get_best_node(parse(data)))
+def get_article(data, encoding=None):
+    return lxml.etree.tostring(get_best_node(parse(data, encoding)))