From a35225a23431e27f0189eb71496608b0769cea23 Mon Sep 17 00:00:00 2001 From: pictuga Date: Mon, 6 Apr 2015 23:12:50 +0800 Subject: [PATCH] 2to3: fix feedify string handling --- morss/feedify.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/morss/feedify.py b/morss/feedify.py index 81f2ee9..094fe46 100644 --- a/morss/feedify.py +++ b/morss/feedify.py @@ -108,18 +108,22 @@ class Builder(object): self.link = link self.cache = cache - if data is None: - data = urlopen(link).read() self.data = data + if self.data is None: + self.data = urlopen(link).read() + + self.encoding = crawler.detect_encoding(self.data) + + if isinstance(self.data, bytes): + self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace') + self.rule = get_rule(link) if self.rule['mode'] == 'xpath': - if isinstance(self.data, bytes): - self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace') self.doc = lxml.html.fromstring(self.data) elif self.rule['mode'] == 'json': - self.doc = json.loads(data) + self.doc = json.loads(self.data) self.feed = feeds.FeedParserAtom() @@ -133,7 +137,7 @@ class Builder(object): a = [html] b = [] for x in expr.strip(".").split("."): - match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups() + match = re.search('^([^\[]+)(?:\[([0-9]+)\])?$', x).groups() for elem in a: if isinstance(elem, dict): kids = elem.get(match[0]) @@ -166,10 +170,12 @@ class Builder(object): out.append(match) elif isinstance(match, lxml.html.HtmlElement): out.append(lxml.html.tostring(match)) - return out elif self.rule['mode'] == 'json': - return self.raw(html, expr) + out = self.raw(html, expr) + + out = [x.decode(self.encoding) if isinstance(x, bytes) else x for x in out] + return out def string(self, html, expr): " Makes a formatted string out of the getter and rule "