2to3: first attempt to fix strings

master
pictuga 2015-02-26 00:50:23 +08:00
parent 071288015b
commit 7bd448789d
3 changed files with 19 additions and 5 deletions

View File

@@ -14,9 +14,13 @@ except ImportError:
from urllib.request import HTTPSHandler, BaseHandler, AbstractHTTPHandler, Request, addinfourl
from http.client import HTTPException, HTTPConnection, HTTPS_PORT
import re
try:
basestring
except NameError:
basestring = str
MIMETYPE = {
'xml': ['text/xml', 'application/xml', 'application/rss+xml', 'application/rdf+xml', 'application/atom+xml'],

View File

@@ -18,6 +18,11 @@ except ImportError:
from urllib.parse import urlparse, urljoin
from urllib.request import urlopen
try:
basestring
except NameError:
basestring = str
def to_class(query):
pattern = r'\[class=([^\]]+)\]'
@@ -108,7 +113,7 @@ class Builder(object):
self.rule = get_rule(link)
if self.rule['mode'] == 'xpath':
-if not isinstance(self.data, unicode):
+if isinstance(self.data, bytes):
self.data = self.data.decode(crawler.detect_encoding(self.data), 'replace')
self.doc = lxml.html.fromstring(self.data)
elif self.rule['mode'] == 'json':

View File

@@ -28,6 +28,11 @@ except ImportError:
from io import StringIO
from urllib.request import urlopen
try:
basestring
except NameError:
basestring = unicode = str
Element = etree.Element
@@ -79,7 +84,7 @@ def parse(data):
match = re.search('encoding=["\']?([0-9a-zA-Z-]+)', data[:100])
if match:
enc = match.groups()[0].lower()
-if not isinstance(data, unicode):
+if isinstance(data, bytes):
data = data.decode(enc, 'ignore')
data = data.encode(enc)
@@ -373,8 +378,8 @@ class FeedParser(FeedBase):
out = StringIO()
c = csv.writer(out, dialect=csv.excel)
for item in self.items:
-row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item if
-isinstance(x[1], basestring)]
+row = [x[1].encode('utf-8') if isinstance(x[1], unicode) else x[1] for x in item] # str
+#isinstance(x[1], basestring)] # bytes or str
c.writerow(row)
out.seek(0)
return out.read()