From 0978e763566ac22f6b4f121f61e0b330803cdf9c Mon Sep 17 00:00:00 2001 From: pictuga Date: Sat, 8 Jun 2013 17:32:55 +0200 Subject: [PATCH] str.decode() within EncDownload() --- morss.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/morss.py b/morss.py index 70d9b16..20c6a3a 100644 --- a/morss.py +++ b/morss.py @@ -240,7 +240,7 @@ class XMLMap(object): else: out = self._xml.__getattr__(tag) - return unicode(out).encode('utf-8') if self._str else out + return unicode(out) if self._str else out def __getitem__(self, tag): if self.__contains__(tag): @@ -320,7 +320,8 @@ def EncDownload(url): log('chardet') enc = chardet.detect(data)['encoding'] - return (data, enc, con.geturl()) + log(enc) + return (data.decode(enc, 'replace'), con.geturl()) def Fill(rss, cache, feedurl="/", fast=False): """ Returns True when it has done its best """ @@ -383,17 +384,16 @@ def Fill(rss, cache, feedurl="/", fast=False): return False # download - ddl = EncDownload(item.link) + ddl = EncDownload(item.link.encode('utf-8')) if ddl is False: log('http error') cache.set(item.link, 'error-http') return True - data, enc, url = ddl - log(enc) + data, url = ddl - out = readability.Document(data.decode(enc, 'ignore'), url=url).summary(True) + out = readability.Document(data, url=url).summary(True) if 'desc' not in item or lenHTML(out) > lenHTML(item.desc): item.content = out cache.set(item.link, out)