2to3: fix feedify string handling

master
pictuga 2015-04-06 23:12:50 +08:00
parent 1b4fc88ad0
commit a35225a234
1 changed file with 14 additions and 8 deletions

View File

@@ -108,18 +108,22 @@ class Builder(object):
self.link = link self.link = link
self.cache = cache self.cache = cache
if data is None:
data = urlopen(link).read()
self.data = data self.data = data
if self.data is None:
self.data = urlopen(link).read()
self.encoding = crawler.detect_encoding(self.data)
if isinstance(self.data, bytes):
self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace')
self.rule = get_rule(link) self.rule = get_rule(link)
if self.rule['mode'] == 'xpath': if self.rule['mode'] == 'xpath':
if isinstance(self.data, bytes):
self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace')
self.doc = lxml.html.fromstring(self.data) self.doc = lxml.html.fromstring(self.data)
elif self.rule['mode'] == 'json': elif self.rule['mode'] == 'json':
self.doc = json.loads(data) self.doc = json.loads(self.data)
self.feed = feeds.FeedParserAtom() self.feed = feeds.FeedParserAtom()
@@ -133,7 +137,7 @@ class Builder(object):
a = [html] a = [html]
b = [] b = []
for x in expr.strip(".").split("."): for x in expr.strip(".").split("."):
match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups() match = re.search('^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
for elem in a: for elem in a:
if isinstance(elem, dict): if isinstance(elem, dict):
kids = elem.get(match[0]) kids = elem.get(match[0])
@@ -166,10 +170,12 @@ class Builder(object):
out.append(match) out.append(match)
elif isinstance(match, lxml.html.HtmlElement): elif isinstance(match, lxml.html.HtmlElement):
out.append(lxml.html.tostring(match)) out.append(lxml.html.tostring(match))
return out
elif self.rule['mode'] == 'json': elif self.rule['mode'] == 'json':
return self.raw(html, expr) out = self.raw(html, expr)
out = [x.decode(self.encoding) if isinstance(x, bytes) else x for x in out]
return out
def string(self, html, expr): def string(self, html, expr):
" Makes a formatted string out of the getter and rule " " Makes a formatted string out of the getter and rule "