python - UnicodeEncodeError: 'ascii' codec can't encode character u'\u0446' in position 32: ordinal not in range(128) -
I'm trying to debug code that a previous intern wrote, and I'm having difficulty resolving this issue using the answers from other Unicode error posts.
The error is found in the last line of this function:
def dumptextpacket(self, header, buglog, offset, outfile):
    """Write one text packet from the log to ``outfile`` as a timestamped line.

    Args:
        header: Packet header sequence. ``header[1]`` is the packet size in
            bytes, ``header[3]`` is the btag and ``header[5]`` is the
            timestamp, which is formatted with three decimals and an
            ``ms`` suffix.
        buglog: Seekable binary file object holding the log.
        offset: Byte offset in ``buglog`` at which the packet payload starts.
        outfile: Output file object; it receives a UTF-8 encoded byte string.
    """
    buglog.seek(offset)
    data = buglog.read(header[1])  # header[1] = size of packet
    # Bytes that are not valid UTF-8 are dropped instead of raising.
    outstring = data.decode("utf-8", "ignore")
    if header[3] == 8:
        # Packets with btag = 8 start with one unwanted character.
        outstring = outstring[1:]
    # NOTE(review): the source was collapsed onto one line, so it is unclear
    # whether this strip was nested under the btag check above -- confirm.
    outstring = outstring.strip('\0')  # remove NUL characters around the text
    # Put the timestamp at the beginning of the line.
    outstring = "{:.3f}".format(header[5]) + ' ms: ' + outstring
    # decode() produced Unicode text. Writing it to a Python 2 byte file
    # triggers an implicit ASCII encode that raises UnicodeEncodeError on any
    # non-ASCII character, so encode explicitly with UTF-8 instead.
    outfile.write(outstring.encode("utf-8"))
I don't have much experience with Unicode, so I would appreciate any pointers on this issue!
Edit: I'm using Python 2.7, and below is the entire file. One thing I should mention is that the code does work when parsing some files, but I think it errors on other files when the timestamp gets too big?
In my main.py file, I call the method loginterpreter.execute(), and the traceback gives the error shown in the title on the line "outfile.write(outstring)", the last line in the dumptextpacket method, which is called in the execute method:
import sys
import os
from struct import unpack


class loginterpreter:
    """Convert binary ``.bin`` logs into text files of timestamped messages.

    Written for Python 2.7; the code below is also valid Python 3.
    """

    def __init__(self):
        self.rtcupdated = False
        self.rtcoffset = 0.0
        self.lasttimestamp = 0.0
        self.timerrollovercount = 0
        self.thistimestamp = 0.0
        self.m_rtcseconds = 0.0
        self.m_starttimeinsec = 0.0

    def getrtcoffset(self):
        """Return ``m_rtcseconds - m_starttimeinsec``."""
        return self.m_rtcseconds - self.m_starttimeinsec

    def converttimestamp(self, utime, logrev):
        """Convert a raw tick count into milliseconds.

        Args:
            utime: Raw timer value taken from the packet header.
            logrev: Log revision from the packet header; rollover tracking
                is only performed when it equals 2.

        Returns:
            The timestamp in milliseconds as a float, including the
            accumulated rollover time and the RTC offset.
        """
        ticspersecond = 24000000.0

        self.thistimestamp = utime
        self.rtcoffset = self.getrtcoffset()

        # NOTE(review): the source was collapsed onto one line; the nesting
        # of the three statements below was reconstructed -- confirm.
        if int(logrev) == 2:
            if self.rtcupdated:
                self.lasttimestamp = 0.0
            # A timestamp smaller than the previous one counts as a rollover.
            if self.lasttimestamp > self.thistimestamp:
                self.timerrollovercount += 1
            self.lasttimestamp = self.thistimestamp

        ulnumber = (-1 & 0xffffffff)  # 0xFFFFFFFF

        return ((ulnumber / ticspersecond) * self.timerrollovercount
                + (utime / ticspersecond)
                + self.rtcoffset) * 1000.0

    ##########################################################################
    # Information from the header of the packet currently being looked at.  #
    ##########################################################################
    def grabheader(self, buglog, offset):
        """Read and unpack the 12-byte packet header at ``offset``.

        Header layout (s_pkthdrrev1):
            /*0*/ u16 startofpacketmarker;  # ushort 2
            /*2*/ u16 sizeofpacket;         # ushort 2
            /*4*/ u08 logrev;               # uchar  1
            /*5*/ u08 btag;                 # uchar  1
            /*6*/ u16 iseq;                 # ushort 2
            /*8*/ u32 utime;                # ulong  4

        Args:
            buglog: Seekable binary file object holding the log.
            offset: Byte offset of the header within ``buglog``.

        Returns:
            A 6-tuple: the first five header fields followed by the
            timestamp converted to milliseconds.

        Prints an error and exits the process if fewer than 12 bytes are
        available at ``offset``.
        """
        headersize = 12  # header size in bytes
        # '=' selects native byte order with standard sizes and no padding,
        # so the format is exactly 12 bytes on every platform (a native 'L'
        # is 8 bytes on LP64 systems).  Unsigned codes match the u16/u08/u32
        # fields documented above.
        btype = '=HHBBHL'
        buglog.seek(offset)
        data = buglog.read(headersize)
        if len(data) < headersize:
            print('error in format of bblog file')
            sys.exit()
        headerarray = unpack(btype, data)
        convertedtime = self.converttimestamp(headerarray[5], headerarray[2])
        headerarray = headerarray[:5] + (convertedtime,)
        return headerarray

    ################################################################
    # btag = 8 or btag = 16 --> write data to the output text file #
    ################################################################
    def dumptextpacket(self, header, buglog, offset, outfile):
        """Write one text packet to ``outfile`` as a timestamped line.

        Args:
            header: Header tuple as returned by ``grabheader``.
            buglog: Seekable binary file object holding the log.
            offset: Byte offset at which the packet payload starts.
            outfile: Output file object; it receives UTF-8 encoded bytes.
        """
        buglog.seek(offset)
        data = buglog.read(header[1])  # header[1] = size of packet
        # Bytes that are not valid UTF-8 are dropped instead of raising.
        outstring = data.decode("utf-8", "ignore")
        if header[3] == 8:
            # Packets with btag = 8 start with one unwanted character.
            outstring = outstring[1:]
        # NOTE(review): unclear from the collapsed source whether this strip
        # was nested under the btag check above -- confirm.
        outstring = outstring.strip('\0')  # remove NUL characters around the text
        # Put the timestamp at the beginning of the line.
        outstring = "{:.3f}".format(header[5]) + ' ms: ' + outstring
        # decode() produced Unicode text; encode explicitly so Python 2 does
        # not fall back to the ASCII codec and raise UnicodeEncodeError.
        outfile.write(outstring.encode("utf-8"))

    def execute(self):
        """Parse every ``.bin`` file in ``./logs/`` into a ``.txt`` file.

        Each directory entry is printed; for ``.bin`` files the packets are
        walked in order and packets with btag 8 or 16 are written out via
        ``dumptextpacket``.
        """
        path = './logs/'
        for fn in os.listdir(path):
            filename = fn
            print(fn)
            if filename.endswith(".bin"):
                print("parsing " + filename)
                # Binary mode: dumptextpacket writes UTF-8 bytes, which a
                # Python 3 text-mode file would reject.  The with-block also
                # closes the output even if parsing raises.
                with open(path + filename.split('.')[0] + ".txt", "wb") as outfile:
                    filesize = os.path.getsize(path + filename)
                    packetoffset = 0
                    with open(path + filename, 'rb') as buglog:
                        while packetoffset < filesize:
                            # Grab the header of the current packet.
                            currheader = self.grabheader(buglog, packetoffset)
                            # Skip past the 12-byte header to the payload.
                            packetoffset = packetoffset + 12
                            # btag 8 and 16 mark text packets.
                            if currheader[3] == 8 or currheader[3] == 16:
                                self.dumptextpacket(currheader, buglog,
                                                    packetoffset, outfile)
                            # Advance by the size of the current packet.
                            packetoffset = packetoffset + currheader[1]
                print(filename + " completed.")
When you add two strings and one of them is Unicode, Python 2 will coerce the result to Unicode too.
>>> 'a' + u'b' u'ab'
Since you used data.decode, outstring will be Unicode.
When you write to a binary file, you must have a byte string. Python 2 will attempt to convert your Unicode string to a byte string, but it uses the most generic codec it has: 'ascii'. This codec fails on many Unicode characters, specifically those with a codepoint above '\u007f'. You can encode it yourself with a more capable codec to get around this problem:
outfile.write(outstring.encode('utf-8'))
Everything changes in Python 3, which won't let you mix byte strings and Unicode strings, nor will it attempt any automatic conversions.
Comments
Post a Comment