From 993ac638a3aadd30eb962494da01e70801ad3f8d Mon Sep 17 00:00:00 2001
From: Florian Muenchbach
Date: Sun, 31 Jan 2016 13:52:23 +0100
Subject: [PATCH] Added override for auto-detected character encoding of
 parsed pages.

---
 morss/crawler.py     |  5 ++++-
 morss/morss.py       | 11 ++++++-----
 morss/readabilite.py | 12 ++++++++----
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/morss/crawler.py b/morss/crawler.py
index 263d110..85826bd 100644
--- a/morss/crawler.py
+++ b/morss/crawler.py
@@ -72,11 +72,14 @@ def detect_encoding(data, con=None):
 
 class EncodingFixHandler(BaseHandler):
+    def __init__(self, encoding=None):
+        self.encoding = encoding
+
     def http_response(self, req, resp):
         maintype = resp.info().get('Content-Type', '').split('/')[0]
 
         if 200 <= resp.code < 300 and maintype == 'text':
             data = resp.read()
-            enc = detect_encoding(data, resp)
+            enc = detect_encoding(data, resp) if not self.encoding else self.encoding
 
             if enc:
                 data = data.decode(enc, 'replace')
diff --git a/morss/morss.py b/morss/morss.py
index c7a9c5d..3124f5e 100644
--- a/morss/morss.py
+++ b/morss/morss.py
@@ -129,10 +129,11 @@ def parseOptions(options):
 
 default_handlers = [crawler.GZIPHandler(), crawler.UAHandler(DEFAULT_UA),
                     crawler.AutoRefererHandler(), crawler.HTTPEquivHandler(),
-                    crawler.HTTPRefreshHandler(), crawler.EncodingFixHandler()]
+                    crawler.HTTPRefreshHandler()]
 
-def custom_handler(accept, delay=DELAY):
+def custom_handler(accept, delay=DELAY, encoding=None):
     handlers = default_handlers[:]
+    handlers.append(crawler.EncodingFixHandler(encoding))
     handlers.append(crawler.ContentNegociationHandler(crawler.MIMETYPE[accept]))
     handlers.append(crawler.SQliteCacheHandler(delay))
 
@@ -266,7 +267,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         delay = -2
 
     try:
-        con = custom_handler('html', delay).open(link, timeout=TIMEOUT)
+        con = custom_handler('html', delay, options.encoding).open(link, timeout=TIMEOUT)
         data = con.read()
 
     except (IOError, HTTPException) as e:
@@ -278,7 +279,7 @@ def ItemFill(item, options, feedurl='/', fast=False):
         log('non-text page')
         return True
 
-    out = readabilite.get_article(data)
+    out = readabilite.get_article(data, options.encoding)
 
     if options.hungry or count_words(out) > max(count_content, count_desc):
         item.push_content(out)
@@ -367,7 +368,7 @@ def FeedFetch(url, options):
         delay = 0
 
     try:
-        con = custom_handler('xml', delay).open(url, timeout=TIMEOUT * 2)
+        con = custom_handler('xml', delay, options.encoding).open(url, timeout=TIMEOUT * 2)
         xml = con.read()
 
     except (HTTPError) as e:
diff --git a/morss/readabilite.py b/morss/readabilite.py
index 15f11b1..c424513 100644
--- a/morss/readabilite.py
+++ b/morss/readabilite.py
@@ -3,8 +3,12 @@ import lxml.html
 import re
 
 
-def parse(data):
-    parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+def parse(data, encoding=None):
+    if encoding:
+        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True, encoding=encoding)
+    else:
+        parser = lxml.html.HTMLParser(remove_blank_text=True, remove_comments=True)
+
     return lxml.html.fromstring(data, parser=parser)
 
 
@@ -149,5 +153,5 @@ def br2p(root):
             gdparent.insert(gdparent.index(parent)+1, new_item)
 
 
-def get_article(data):
-    return lxml.etree.tostring(get_best_node(parse(data)))
+def get_article(data, encoding=None):
+    return lxml.etree.tostring(get_best_node(parse(data, encoding)))
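
A minimal usage sketch of the new override, assuming the module layout in
the diff above (the sample markup, charset, and variable names are
illustrative only; the signatures come from the patch itself):

    from morss import crawler, readabilite

    # Fetch path: EncodingFixHandler can now be pinned to a charset, in
    # which case it bypasses detect_encoding() for the response body.
    handler = crawler.EncodingFixHandler('iso-8859-1')

    # Parse path: the same charset can be forced on lxml's HTMLParser.
    # Feed bytes rather than unicode so the forced charset applies.
    html = u'<p>caf\xe9</p>'.encode('iso-8859-1')
    doc = readabilite.parse(html, encoding='iso-8859-1')
    print(doc.text_content())

When no encoding is given, both paths fall back to the previous behaviour
(detect_encoding() on the fetch side, lxml's own guessing on the parse side).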