Added override for auto-detected character encoding of parsed pages.

Branch: master
Florian Muenchbach 2016-01-31 13:52:23 +01:00 committed by pictuga
parent 627163abff
commit 993ac638a3
3 changed files with 18 additions and 10 deletions

crawler.py

@@ -72,11 +72,14 @@ def detect_encoding(data, con=None):
 class EncodingFixHandler(BaseHandler):
+    def __init__(self, encoding=None):
+        self.encoding = encoding
+
     def http_response(self, req, resp):
         maintype = resp.info().get('Content-Type', '').split('/')[0]

         if 200 <= resp.code < 300 and maintype == 'text':
             data = resp.read()
-            enc = detect_encoding(data, resp)
+            enc = detect_encoding(data, resp) if not self.encoding else self.encoding

             if enc:
                 data = data.decode(enc, 'replace')
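
With the new constructor argument, a caller can force an encoding instead of relying on detect_encoding(). A minimal sketch of wiring the handler into an opener (not part of this commit; the URL is invented, the import path assumes the package exposes a crawler module, and Python 2's urllib2 is assumed since the handlers extend its BaseHandler):

    import urllib2
    from morss import crawler  # import path is an assumption about the package layout

    # encoding=None (the default) keeps the old auto-detection behaviour;
    # a truthy value short-circuits detect_encoding() entirely
    opener = urllib2.build_opener(crawler.EncodingFixHandler(encoding='iso-8859-1'))
    response = opener.open('http://example.com/latin1-page.html')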

morss.py

@@ -129,10 +129,11 @@ def parseOptions(options):
 default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
     crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(),
-    crawler.HTTPRefreshHandler(), crawler.EncodingFixHandler()]
+    crawler.HTTPRefreshHandler()]

-def custom_handler(accept, delay=DELAY):
+def custom_handler(accept, delay=DELAY, encoding=None):
     handlers = default_handlers[:]
+    handlers.append(crawler.EncodingFixHandler(encoding))
     handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept]))
     handlers.append(crawler.SQliteCacheHandler(delay))
@@ -266,7 +267,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2

     try:
-        con = custom_handler('html', delay).open(link, timeout=TIMEOUT)
+        con = custom_handler('html', delay, options.encoding).open(link, timeout=TIMEOUT)
         data = con.read()

     except (IOError, HTTPException) as e:
@@ -278,7 +279,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         log('non-text page')
         return True

-    out = readabilite.get_article(data)
+    out = readabilite.get_article(data, options.encoding)

     if options.hungry or count_words(out) > max(count_content, count_desc):
         item.push_content(out)
@@ -367,7 +368,7 @@ def FeedFetch(url, options):
         delay = 0

     try:
-        con = custom_handler('xml', delay).open(url, timeout=TIMEOUT * 2)
+        con = custom_handler('xml', delay, options.encoding).open(url, timeout=TIMEOUT * 2)
         xml = con.read()

     except (HTTPError) as e:
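
Both fetch paths now thread options.encoding through custom_handler, so one per-request override covers article and feed fetching alike. A hypothetical call outside morss (the URL and the literal delay/timeout values are invented stand-ins for morss's own constants):

    # encoding=None keeps detection; an explicit value is used as-is
    con = custom_handler('html', delay=0, encoding='utf-8').open('http://example.com/article', timeout=10)
    data = con.read()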

readabilite.py

@@ -3,8 +3,12 @@ import lxml.html
 import re

-def parse(data):
-    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+def parse(data, encoding=None):
+    if encoding:
+        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
+    else:
+        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+
     return lxml.html.fromstring(data, parser=parser)
@@ -149,5 +153,5 @@ def br2p(root):
             gdparent.insert(gdparent.index(parent)+1, new_item)

-def get_article(data):
-    return lxml.etree.tostring(get_best_node(parse(data)))
+def get_article(data, encoding=None):
+    return lxml.etree.tostring(get_best_node(parse(data, encoding)))
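
On the parsing side, the override maps directly onto lxml's own parser option: HTMLParser(encoding=...) makes libxml2 use the given charset instead of sniffing one. A standalone sketch (file name invented), fed raw bytes since the encoding option only applies to undecoded input:

    import lxml.html

    # read raw bytes; lxml decodes them with the forced encoding
    with open('page.html', 'rb') as f:
        data = f.read()

    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding='iso-8859-1')
    root = lxml.html.fromstring(data, parser=parser)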