str.decode() within EncDownload()

master
pictuga 2013-06-08 17:32:55 +02:00
parent 89354e1528
commit 0978e76356
1 changed file with 6 additions and 6 deletions

View File

@@ -240,7 +240,7 @@ class XMLMap(object):
else: else:
out = self._xml.__getattr__(tag) out = self._xml.__getattr__(tag)
return unicode(out).encode('utf-8') if self._str else out return unicode(out) if self._str else out
def __getitem__(self, tag): def __getitem__(self, tag):
if self.__contains__(tag): if self.__contains__(tag):
@@ -320,7 +320,8 @@ def EncDownload(url):
log('chardet') log('chardet')
enc = chardet.detect(data)['encoding'] enc = chardet.detect(data)['encoding']
return (data, enc, con.geturl()) log(enc)
return (data.decode(enc, 'replace'), con.geturl())
def Fill(rss, cache, feedurl="/", fast=False): def Fill(rss, cache, feedurl="/", fast=False):
""" Returns True when it has done its best """ """ Returns True when it has done its best """
@@ -383,17 +384,16 @@ def Fill(rss, cache, feedurl="/", fast=False):
return False return False
# download # download
ddl = EncDownload(item.link) ddl = EncDownload(item.link.encode('utf-8'))
if ddl is False: if ddl is False:
log('http error') log('http error')
cache.set(item.link, 'error-http') cache.set(item.link, 'error-http')
return True return True
data, enc, url = ddl data, url = ddl
log(enc)
out = readability.Document(data.decode(enc, 'ignore'), url=url).summary(True) out = readability.Document(data, url=url).summary(True)
if 'desc' not in item or lenHTML(out) > lenHTML(item.desc): if 'desc' not in item or lenHTML(out) > lenHTML(item.desc):
item.content = out item.content = out
cache.set(item.link, out) cache.set(item.link, out)