2to3: fix feedify string handling
parent
1b4fc88ad0
commit
a35225a234
|
@ -108,18 +108,22 @@ class Builder(object):
|
||||||
self.link = link
|
self.link = link
|
||||||
self.cache = cache
|
self.cache = cache
|
||||||
|
|
||||||
if data is None:
|
|
||||||
data = urlopen(link).read()
|
|
||||||
self.data = data
|
self.data = data
|
||||||
|
|
||||||
|
if self.data is None:
|
||||||
|
self.data = urlopen(link).read()
|
||||||
|
|
||||||
|
self.encoding = crawler.detect_encoding(self.data)
|
||||||
|
|
||||||
|
if isinstance(self.data, bytes):
|
||||||
|
self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace')
|
||||||
|
|
||||||
self.rule = get_rule(link)
|
self.rule = get_rule(link)
|
||||||
|
|
||||||
if self.rule['mode'] == 'xpath':
|
if self.rule['mode'] == 'xpath':
|
||||||
if isinstance(self.data, bytes):
|
|
||||||
self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace')
|
|
||||||
self.doc = lxml.html.fromstring(self.data)
|
self.doc = lxml.html.fromstring(self.data)
|
||||||
elif self.rule['mode'] == 'json':
|
elif self.rule['mode'] == 'json':
|
||||||
self.doc = json.loads(data)
|
self.doc = json.loads(self.data)
|
||||||
|
|
||||||
self.feed = feeds.FeedParserAtom()
|
self.feed = feeds.FeedParserAtom()
|
||||||
|
|
||||||
|
@ -133,7 +137,7 @@ class Builder(object):
|
||||||
a = [html]
|
a = [html]
|
||||||
b = []
|
b = []
|
||||||
for x in expr.strip(".").split("."):
|
for x in expr.strip(".").split("."):
|
||||||
match = re.search(r'^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
|
match = re.search('^([^\[]+)(?:\[([0-9]+)\])?$', x).groups()
|
||||||
for elem in a:
|
for elem in a:
|
||||||
if isinstance(elem, dict):
|
if isinstance(elem, dict):
|
||||||
kids = elem.get(match[0])
|
kids = elem.get(match[0])
|
||||||
|
@ -166,10 +170,12 @@ class Builder(object):
|
||||||
out.append(match)
|
out.append(match)
|
||||||
elif isinstance(match, lxml.html.HtmlElement):
|
elif isinstance(match, lxml.html.HtmlElement):
|
||||||
out.append(lxml.html.tostring(match))
|
out.append(lxml.html.tostring(match))
|
||||||
return out
|
|
||||||
|
|
||||||
elif self.rule['mode'] == 'json':
|
elif self.rule['mode'] == 'json':
|
||||||
return self.raw(html, expr)
|
out = self.raw(html, expr)
|
||||||
|
|
||||||
|
out = [x.decode(self.encoding) if isinstance(x, bytes) else x for x in out]
|
||||||
|
return out
|
||||||
|
|
||||||
def string(self, html, expr):
|
def string(self, html, expr):
|
||||||
" Makes a formatted string out of the getter and rule "
|
" Makes a formatted string out of the getter and rule "
|
||||||
|
|
Loading…
Reference in New Issue