使用Python 2.7读取和写入CSV文件,包括unicode
我是Python的新手,我有一个关于如何使用Python来读取和写入CSV文件的问题。 我的文件包含德语、法语等语言的字符。用我的代码,这些文件可以在Python中正确读取,但是当我把内容写入一个新的CSV文件时,Unicode字符变成了一些奇怪的字符。
数据如下所示:
而我的代码是:
import csv

# Open the source CSV for reading in binary mode and wrap it in a csv reader.
f=open('xxx.csv','rb')
reader=csv.reader(f)

# Open the destination CSV in binary mode; QUOTE_ALL makes the writer quote every field.
wt=open('lll.csv','wb')
writer=csv.writer(wt,quoting=csv.QUOTE_ALL)

# NOTE(review): the code shown never copies rows from `reader` to `writer`
# before closing both files.
wt.close()
f.close()
其结果是:
你能告诉我该怎么办才能解决这个问题吗? 非常感谢你!
确保你编码和解码适当。
下面的例子把一些示例文本以UTF-8编码写入一个csv文件,然后再读回来,以演示完整的往返过程:
# -*- coding: utf-8 -*-
# Round-trip sample unicode text through a UTF-8 encoded CSV file (Python 2).
import csv

tests = {'German': [u'Straße', u'auslösen', u'zerstören'],
         'French': [u'français', u'américaine', u'épais'],
         'Chinese': [u'中國的', u'英語', u'美國人']}

# The Python 2 csv module requires files opened in binary mode; in text mode
# the writer's '\r\n' terminator is translated again on Windows ('\r\r\n').
with open('/tmp/utf.csv', 'wb') as fout:
    writer = csv.writer(fout)
    # Header row: one column per language. keys() and values() come from the
    # same unmodified dict, so their orders correspond.
    writer.writerow(tests.keys())
    # zip(*values) transposes the per-language lists into one row per word.
    for row in zip(*tests.values()):
        # Encode at the output boundary: the writer only handles byte strings.
        writer.writerow([s.encode('utf-8') for s in row])

with open('/tmp/utf.csv', 'rb') as fin:
    reader = csv.reader(fin)
    for row in reader:
        # Decode at the input boundary, then left-align each cell in a
        # 15-character column.
        cells = [s.decode('utf-8') for s in row]
        fmt = u'{:<15}' * len(cells)
        print(fmt.format(*cells))
打印:
German         Chinese        French
Straße         中國的            français
auslösen       英語             américaine
zerstören      美國人            épais
另一种选择:
使用unicodecsv软件包中的代码…
https://pypi.python.org/pypi/unicodecsv/
>>> import unicodecsv as csv
>>> from io import BytesIO
>>> f = BytesIO()  # in-memory byte buffer standing in for a file
>>> w = csv.writer(f, encoding='utf-8')
>>> _ = w.writerow((u'é', u'ñ'))  # the row is passed as unicode strings
>>> _ = f.seek(0)  # rewind so the reader starts at the beginning
>>> r = csv.reader(f, encoding='utf-8')
>>> next(r) == [u'é', u'ñ']  # the row reads back as unicode strings
True
该模块的API与标准库(stdlib)的csv模块兼容。
在csv模块文档的末尾有一个例子,演示了如何处理Unicode。 下面的代码直接复制自该示例。 请注意,读取或写入的字符串都是Unicode字符串。 例如,不要将字节字符串传递给UnicodeWriter.writerows。
import csv,codecs,cStringIO


class UTF8Recoder:
    '''Iterator that decodes lines of stream f from `encoding` and yields them re-encoded as UTF-8.'''

    def __init__(self, f, encoding):
        # Wrap f in a codec stream reader for the given source encoding.
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        # Hand the next decoded line on as UTF-8 bytes for csv.reader to parse.
        return self.reader.next().encode("utf-8")


class UnicodeReader:
    '''CSV reader over stream f, read in `encoding` (default "utf-8-sig"),
    that yields each row as a list of unicode strings.

    `dialect` and any extra keyword arguments are forwarded to csv.reader.
    '''

    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        # Recode the input to UTF-8 first so csv.reader always sees UTF-8 bytes.
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        '''next() -> list of unicode

        Read the next row and return its cells decoded from UTF-8.
        '''
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self


class UnicodeWriter:
    '''CSV writer that takes rows of unicode strings and writes them to
    stream f in `encoding` (default "utf-8-sig").

    `dialect` and any extra keyword arguments are forwarded to csv.writer.
    '''

    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        # csv.writer writes into an in-memory queue; each row is recoded from
        # there before it reaches the real stream.
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # An incremental encoder keeps state between calls, so with the default
        # "utf-8-sig" the BOM is emitted only once, at the start of the output.
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        '''writerow(sequence of unicode) -> None

        Encode one row and write it to the output stream. Every cell must
        provide .encode (i.e. be a unicode string).
        '''
        # Let csv.writer format the row as UTF-8 bytes into the queue.
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch the formatted UTF-8 line from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding.
        data = self.encoder.encode(data)
        self.stream.write(data)
        # Empty the queue so the next row starts from a clean buffer.
        self.queue.truncate(0)

    def writerows(self, rows):
        '''Write every row in `rows` via writerow().'''
        for row in rows:
            self.writerow(row)


# Copy xxx.csv to lll.csv row by row, quoting every field in the output.
with open('xxx.csv','rb') as fin, open('lll.csv','wb') as fout:
    reader = UnicodeReader(fin)
    writer = UnicodeWriter(fout,quoting=csv.QUOTE_ALL)
    for line in reader:
        writer.writerow(line)
输入(UTF-8编码):
American,美国人
French,法国人
German,德国人
输出:
"American","美国人"
"French","法国人"
"German","德国人"
我遇到过同样的问题。 答案是:你其实已经做对了,这是MS Excel的问题。 试着用另一个编辑器打开文件,你会发现编码已经成功。 为了让MS Excel能正确显示,请把编码从UTF-8改为UTF-16。 下面的代码应该可以工作:
class UnicodeWriter:
    """CSV writer that encodes rows into a target encoding (default UTF-16).

    Cells are converted with unicode(), so non-string values such as ints
    are accepted. For UTF-16 a single byte-order mark is written when the
    writer is created, and the per-row BOM that str.encode() adds is removed.

    Parameters:
        f: writable byte stream that receives the encoded output.
        dialect: csv dialect passed to csv.writer (default csv.excel_tab).
        encoding: name of the target encoding (default "utf-16").
        **kwds: extra keyword arguments forwarded to csv.writer.

    Raises:
        LookupError: if `encoding` is not a known codec name.
    """

    def __init__(self, f, dialect=csv.excel_tab, encoding="utf-16", **kwds):
        import codecs

        # Redirect output to a queue
        self.queue = StringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoding = encoding
        # Compare the canonical codec name rather than the raw string, so
        # aliases such as "UTF-16" or "utf16" get the same BOM handling.
        if codecs.lookup(encoding).name == "utf-16":
            # Force BOM: written once here, stripped from every row below.
            self._bom_len = len(codecs.BOM_UTF16)
            f.write(codecs.BOM_UTF16)
        else:
            self._bom_len = 0

    def writerow(self, row):
        """Encode one row of cells and write it to the target stream."""
        # Modified from original: now using unicode(s) to deal with eg ints
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = data.encode(self.encoding)
        # strip the BOM that encode() prepends to every UTF-16 string
        if self._bom_len:
            data = data[self._bom_len:]
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row in `rows` via writerow()."""
        for row in rows:
            self.writerow(row)
我无法在上面Mark的回答下发表评论,不过我只做了一处修改,修正了单元格中的数据不是unicode(例如float或int数据)时出现的错误。 我把UnicodeWriter中的这一行换成了:`self.writer.writerow([s.encode("utf-8") if type(s)==types.UnicodeType else s for s in row])`,完整代码如下:
class UnicodeWriter:
    '''CSV writer that writes rows to stream f in `encoding` (default
    "utf-8-sig"), encoding unicode cells and passing other cells through.

    `dialect` and any extra keyword arguments are forwarded to csv.writer.
    '''

    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        # csv.writer writes into an in-memory queue; each row is recoded from
        # there before it reaches the real stream.
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # An incremental encoder keeps state between calls, so with the default
        # "utf-8-sig" the BOM is emitted only once, at the start of the output.
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        '''writerow(sequence) -> None

        Encode one row and write it to the output stream. Cells whose type is
        exactly unicode are encoded as UTF-8; all other cells (e.g. int,
        float) are handed to csv.writer unchanged.
        '''
        # Uses types.UnicodeType, so the file must also `import types`.
        self.writer.writerow([s.encode("utf-8") if type(s)==types.UnicodeType else s for s in row])
        # Fetch the formatted UTF-8 line from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding.
        data = self.encoder.encode(data)
        self.stream.write(data)
        # Empty the queue so the next row starts from a clean buffer.
        self.queue.truncate(0)

    def writerows(self, rows):
        '''Write every row in `rows` via writerow().'''
        for row in rows:
            self.writerow(row)
您还需要在文件开头加上 `import types`。