python - UnicodeEncodeError: 'ascii' codec can't encode character u'\u0446' in position 32: ordinal not in range(128) -


I'm trying to debug code that a previous intern wrote, and I'm having difficulty resolving this issue using the answers from other Unicode error posts.

The error is found in the last line of this function:

    def dumptextpacket(self, header, buglog, offset, outfile):
        """Write one text packet from ``buglog`` to ``outfile``, prefixed with its timestamp.

        header  -- indexable packet header: header[1] is the packet size in
                   bytes, header[3] is compared against 8 (the btag value) and
                   header[5] is formatted as the timestamp in front of ' ms: '.
        buglog  -- seekable file object the packet bytes are read from.
        offset  -- position in ``buglog`` at which the packet data starts.
        outfile -- object whose write() receives the formatted text.
        """
        buglog.seek(offset)
        data = buglog.read( header[1] )        # header[1] = size of packet

        # decode() returns a unicode object; bytes that are not valid UTF-8
        # are dropped because of the "ignore" error handler.
        outstring = data.decode("utf-8","ignore")
        if(header[3] == 8): # remove the ugly characters when the packet has btag = 8
            outstring = outstring[1:]
            outstring = outstring.strip('\0')  # remove the 'null' characters from the text
        outstring = "{:.3f}".format(header[5]) + ' ms: ' + outstring              # put the timestamp at the beginning of the line
        # outstring is still unicode here; under Python 2, write() implicitly
        # encodes it with the 'ascii' codec, which is where the reported
        # UnicodeEncodeError is raised for non-ASCII characters.
        outfile.write(outstring)

I don't have much experience with Unicode, so I would appreciate any pointers on this issue!


Edit: I am using Python 2.7, and below is the entire file. One other thing I should mention is that the code does work when parsing some files, but I think it errors on other files when the timestamp gets too big?

In my main.py file, I call the method loginterpreter.execute(), and the traceback gives the error shown in the title on the line "outfile.write(outstring)", which is the last line in the dumptextpacket method that is called in the execute method:

import sys
import os
from struct import unpack


class loginterpreter:
    """Parse the ``.bin`` packet logs found in ./logs/ and dump their text packets.

    Each packet starts with a 12-byte header (see grabheader) followed by
    ``sizeofpacket`` bytes of data. Packets whose btag is 8 or 16 are
    written as text lines to a ``.txt`` file named after the log.

    NOTE(review): identifiers appear lowercased in this listing; the original
    casing is not recoverable from here, so the names are kept as shown.
    """

    def __init__( self ):
        # Rollover-tracking state used by converttimestamp().
        self.rtcupdated = False
        self.rtcoffset = 0.0
        self.lasttimestamp = 0.0
        self.timerrollovercount = 0
        self.thistimestamp = 0.0

        # Nothing in this file changes these two after construction, so
        # getrtcoffset() returns 0.0 unless a caller sets them.
        self.m_rtcseconds = 0.0
        self.m_starttimeinsec = 0.0

    def getrtcoffset( self ):
        """Return the difference between the RTC seconds and the start time."""
        return self.m_rtcseconds - self.m_starttimeinsec

    def converttimestamp(self,utime,logrev):
        """Convert a raw tick count into milliseconds.

        utime  -- raw timer value from the packet header, in ticks.
        logrev -- log revision from the header; rollover detection only runs
                  when int(logrev) == 2.

        Returns the time in milliseconds as a float: the ticks are divided by
        24,000,000 ticks per second, one full 32-bit timer range is added for
        every rollover counted so far, and the RTC offset is added on top.
        """
        ticspersecond = 24000000.0

        self.thistimestamp = utime
        self.rtcoffset = self.getrtcoffset()

        if int( logrev ) == 2:
            if self.rtcupdated:
                self.lasttimestamp = 0.0
            # A timestamp smaller than the previous one is counted as a rollover.
            if self.lasttimestamp > self.thistimestamp:
                self.timerrollovercount += 1
            self.lasttimestamp = self.thistimestamp

        # 0xffffffff: the largest value of a 32-bit unsigned counter.
        ulnumber = (-1 & 0xffffffff)

        return ((ulnumber/ticspersecond)*self.timerrollovercount + (utime/ticspersecond) + self.rtcoffset) * 1000.0

    ##########################################################################
    # Information header for the current packet we are looking at.
    #
    ##########################################################################
    def grabheader(self, buglog, offset):
        '''
        Read and unpack the 12-byte packet header located at ``offset``.

        s_pkthdrrev1
        /*0*/    u16 startofpacketmarker; # ushort 2
        /*2*/    u16 sizeofpacket;        # ushort 2
        /*4*/    u08 logrev;              # uchar  1
        /*5*/    u08 btag;                # uchar  1
        /*6*/    u16 iseq;                # ushort 2
        /*8*/    u32 utime;               # ulong  4

        Returns a 6-tuple: the first five unpacked fields followed by the
        timestamp converted to milliseconds. Exits the program if fewer than
        12 bytes could be read.
        '''
        headersize = 12 # header size in bytes
        # NOTE(review): the layout above lists unsigned fields, which would
        # correspond to 'HHBBHL'; the lowercase codes may be an artifact of how
        # this listing was extracted - confirm against the original file.
        # NOTE(review): with native sizing, 'l' is 8 bytes on some platforms,
        # which would not match headersize = 12 - confirm the target platform.
        btype = 'hhbbhl' # struct format codes for the header fields
        buglog.seek(offset)
        data = buglog.read(headersize)

        if len(data) < headersize:
            print('error in format of bblog file')
            sys.exit()

        headerarray = unpack(btype, data)
        convertedtime = self.converttimestamp(headerarray[5],headerarray[2])
        # Replace the raw utime (index 5) with the converted timestamp.
        headerarray = headerarray[:5] + (convertedtime,)
        return headerarray

    ################################################################
    # btag = 8 or btag = 16 --> write the data to logmsgs.txt      #
    ################################################################
    def dumptextpacket(self, header, buglog, offset, outfile):
        """Write one text packet to ``outfile``, prefixed with its timestamp.

        header  -- tuple returned by grabheader(): header[1] is the packet
                   size, header[3] the btag, header[5] the time in ms.
        buglog  -- open binary log file to read the packet data from.
        offset  -- position in ``buglog`` at which the packet data starts.
        outfile -- open output file the formatted line is written to.
        """
        buglog.seek(offset)
        data = buglog.read( header[1] )                               # header[1] = size of packet

        # decode() returns a unicode object; invalid UTF-8 bytes are dropped.
        outstring = data.decode("utf-8","ignore")
        if(header[3] == 8):                                           # remove the ugly characters when the packet has btag = 8
            outstring = outstring[1:]
            outstring = outstring.strip('\0')                         # remove the 'null' characters from the text
        outstring = "{:.3f}".format(header[5]) + ' ms: ' + outstring  # put the timestamp at the beginning of the line
        # NOTE: this is the line the traceback points at. outstring is unicode,
        # and Python 2 implicitly encodes it with the 'ascii' codec on write.
        # It is intentionally left exactly as posted.
        outfile.write(outstring)

    def execute(self):
        """Convert every ``.bin`` file in ./logs/ into a ``.txt`` file of its text packets."""
        path = './logs/'
        for fn in os.listdir(path):
            filename = fn
            print(fn)
            if (filename.endswith(".bin")):          # earlier variant: if(filename.split('.')[1] == "bin"):
                print("parsing "+filename)
                outfile = open(path+filename.split('.')[0]+".txt", "w")           # open the file for output
                filesize = os.path.getsize(path+filename)
                packetoffset = 0
                with open(path+filename, 'rb') as buglog:
                    while(packetoffset < filesize):
                        currheader = self.grabheader(buglog, packetoffset)        # grab the header of the current packet
                        packetoffset = packetoffset + 12                          # increment the pointer by 12 bytes (size of a header packet)
                        if currheader[3]==8 or currheader[3]==16:                 # look at the btag and see if it is a text packet
                            self.dumptextpacket(currheader, buglog, packetoffset, outfile)
                        packetoffset = packetoffset + currheader[1]               # move on to the next packet by incrementing the pointer by the size of the current packet
                outfile.close()
                print(filename+" completed.")

When you add two strings with one of them being Unicode, Python 2 will coerce the result to Unicode too.

>>> 'a' + u'b'
u'ab'

Since you used data.decode, outstring will be Unicode.

When you write to a binary file, you must have a byte string. Python 2 will attempt to convert your Unicode string to a byte string, but it uses the most generic codec it has: 'ascii'. This codec fails on many Unicode characters, specifically those with a codepoint above '\u007f'. You can encode it yourself with a more capable codec to get around this problem:

outfile.write(outstring.encode('utf-8')) 

Everything changes in Python 3, which won't let you mix byte strings and Unicode strings, nor attempt any automatic conversions.


Comments