From 690531c87e8080d0abccc82e2e6f1c9778909e4e Mon Sep 17 00:00:00 2001 From: Donovan Jimenez Date: Sat, 15 Jan 2011 04:06:50 -0500 Subject: [PATCH] * Added README.md * moved sample programs to examples directory * cleaned up the makefile * converter.go: ConvertString now uses Convert under the hood, removes some code duplication * reader.go: No need to have two separate buffers, can write directly into the buffer given in the Read call. Simplifies code greatly --- Makefile | 29 +----- README.md | 72 ++++++++++++++ converter.go | 97 ++++++++----------- sample.ebcdic-us => examples/sample.ebcdic-us | 0 sample.go => examples/sample.go | 0 sample.utf8 => examples/sample.utf8 | 0 iconv.go | 4 +- reader.go | 91 +++++++++-------- 8 files changed, 164 insertions(+), 129 deletions(-) create mode 100644 README.md rename sample.ebcdic-us => examples/sample.ebcdic-us (100%) rename sample.go => examples/sample.go (100%) rename sample.utf8 => examples/sample.utf8 (100%) diff --git a/Makefile b/Makefile index 9b6fcd8..48f735c 100644 --- a/Makefile +++ b/Makefile @@ -1,34 +1,15 @@ -# Copyright 2009 The Go Authors. All rights reserved. -# Use of this source code is governed by a BSD-style -# license that can be found in the LICENSE file. - include $(GOROOT)/src/Make.inc +# target package name TARG=iconv +# regular go files GOFILES=\ - reader.go + reader.go\ +# files that must be processed by cgo CGOFILES=\ + converter.go\ iconv.go\ - converter.go - -ifeq ($(GOOS),windows) -CGO_LDFLAGS=-liconv -endif - -# To add flags necessary for locating the library or its include files, -# set CGO_CFLAGS or CGO_LDFLAGS. For example, to use an -# alternate installation of the library: -# CGO_CFLAGS=-I/home/rsc/gmp32/include -# CGO_LDFLAGS+=-L/home/rsc/gmp32/lib -# Note the += on the second line. 
- -CLEANFILES+=sample include $(GOROOT)/src/Make.pkg - -# simple test program to test iconv conversion -sample: install sample.go - $(GC) $@.go - $(LD) -o $@ $@.$O diff --git a/README.md b/README.md new file mode 100644 index 0000000..4f5a950 --- /dev/null +++ b/README.md @@ -0,0 +1,72 @@ +Install +======= + +The goinstall command can be used: + + goinstall github.com/djimenez/iconv.go + +Or, you can clone the repository and use gomake instead: + + git clone git://github.com/djimenez/iconv.go.git iconv + cd iconv + gomake install + +Usage +===== + +To use the package, you'll need the appropriate import statement: + + import ( + // if you used goinstall, you'll want this import + iconv "github.com/djimenez/iconv.go" + + // if you used gomake install directly, you'll want this import + "iconv" + ) + +Converting string Values +------------------------ + +Converting a string can be done with two methods. First, there's iconv.ConvertString(input, fromEncoding, toEncoding string): + + output,_ := iconv.ConvertString("Hello World!", "utf-8", "windows-1252") + +Alternatively, you can create a converter and use its ConvertString method. This mostly just saves having to parse the from and to encodings when converting many strings in the same way. + + converter,_ := iconv.NewConverter("utf-8", "windows-1252") + output,_ := converter.ConvertString("Hello World!") + +Converting []byte Values +------------------------ + +Converting a []byte can similarly be done with two methods. First, there's iconv.Convert(input, output []byte, fromEncoding, toEncoding string). You'll immediately notice this requires you to give it both the input and output buffers. Ideally, the output buffer should be sized so that it can hold all converted bytes from input, but if it cannot, then Convert will put as many bytes as it can into the buffer without creating an invalid sequence. 
For example, if iconv only has a single byte left in the output buffer but needs 2 or more for the complete character in a multibyte encoding, it will stop writing to the buffer and return with an iconv.E2BIG error. + + input := []byte("Hello World!") + output := make([]byte, len(input)) + + bytesRead, bytesWritten, error := iconv.Convert(input, output, "utf-8", "windows-1252") + +Just like with ConvertString, there is also a Convert method on Converter that can be used. + + ... + converter,_ := iconv.NewConverter("utf-8", "windows-1252") + + bytesRead, bytesWritten, error := converter.Convert(input, output) + +Converting an io.Reader +------------------------ + +The iconv.Reader allows any other io.Reader to be wrapped and have its bytes transcoded as they are read. + + // We're wrapping stdin for simplicity, but a File or network reader could be wrapped as well + reader,_ := iconv.NewReader(os.Stdin, "utf-8", "windows-1252") + +Converting an io.Writer +------------------------ + +To be written. + +Piping a Conversion +------------------- + +To be written. diff --git a/converter.go b/converter.go index 2077881..4413847 100644 --- a/converter.go +++ b/converter.go @@ -13,7 +13,7 @@ type Converter struct { open bool } -func NewConverter(fromEncoding string, toEncoding string) (converter *Converter, err os.Error) { +func NewConverter(fromEncoding string, toEncoding string) (converter *Converter, err Error) { converter = new(Converter) converter.context, err = C.iconv_open(C.CString(toEncoding), C.CString(fromEncoding)) @@ -47,21 +47,21 @@ func (this *Converter) Close() (err os.Error) { // // NOTE: not all bytes may be consumed from the input. 
This can be because the output // buffer is too small or because there were iconv errors -func (this *Converter) Convert(input []byte, output []byte) (bytesRead int, bytesWritten int, err os.Error) { +func (this *Converter) Convert(input []byte, output []byte) (bytesRead int, bytesWritten int, err Error) { inputLeft := C.size_t(len(input)) outputLeft := C.size_t(len(output)) - - // we're going to give iconv the pointers to the underlying - // storage of each byte slice - so far this is the simplest - // way i've found to do that in Go, but it seems ugly - inputFirstElementPointer := &input[0] - inputPointer := (**C.char)(unsafe.Pointer(&inputFirstElementPointer)) - - outputFirstElementPointer := &output[0] - outputPointer := (**C.char)(unsafe.Pointer(&outputFirstElementPointer)) - - // we're only going to make one call to iconv + if inputLeft > 0 && outputLeft > 0 { + // we're going to give iconv the pointers to the underlying + // storage of each byte slice - so far this is the simplest + // way i've found to do that in Go, but it seems ugly + inputFirstElementPointer := &input[0] + inputPointer := (**C.char)(unsafe.Pointer(&inputFirstElementPointer)) + + outputFirstElementPointer := &output[0] + outputPointer := (**C.char)(unsafe.Pointer(&outputFirstElementPointer)) + + // we're only going to make one call to iconv _,err = C.iconv(this.context, inputPointer, &inputLeft, outputPointer, &outputLeft) // update byte counters @@ -72,59 +72,42 @@ func (this *Converter) Convert(input []byte, output []byte) (bytesRead int, byte return bytesRead, bytesWritten, err } -// convert the bytes of a string and return the resulting string -// -// TODO: can we do this in terms of Convert function -func (this *Converter) ConvertString(input string) (output string, err os.Error) { - // both our input buffer and output buffer will be the same size - // but we'll reuse our output buffer each time its filled - bufferSize := len(input) - sourceLeft := C.size_t(bufferSize) - outputLeft 
:= sourceLeft - outputReset := outputLeft +// convert a string value, returning a new string value +func (this *Converter) ConvertString(input string) (output string, err Error) { - // our input buffer is the source string, but iconv will track - // how many bytes has left to process - sourceBuffer := C.CString(input) - sourcePointer := &sourceBuffer + // construct the buffers + inputBuffer := []byte(input) + outputBuffer := make([]byte, len(inputBuffer) * 2) // we use a larger buffer to help avoid resizing later - outputBuffer := make([]byte, bufferSize) - outputFirstPointer := &outputBuffer[0] - outputPointer := (**C.char)(unsafe.Pointer(&outputFirstPointer)) + // call Convert until all input bytes are read or an error occurs + var bytesRead, totalBytesRead, bytesWritten, totalBytesWritten int - // process the source with iconv in a loop - for sourceLeft > 0 { - //fmt.Println("calling to iconv") - _,err := C.iconv(this.context, sourcePointer, &sourceLeft, outputPointer, &outputLeft) + for totalBytesRead < len(inputBuffer) && err == nil { + bytesRead, bytesWritten, err = this.Convert(inputBuffer, outputBuffer) - //fmt.Println("sourceLeft: ", int(sourceLeft), " outputLeft: ", int(outputLeft)) + totalBytesRead += bytesRead + totalBytesWritten += bytesWritten - // check the err - most interested if we need to expand the output buffer - if err != nil { - //fmt.Println("got error value: ", err) + // check for the E2BIG error specifically, we can add to the output + // buffer to correct for it and then continue + if err == E2BIG { + // increase the size of the output buffer by another input length + // first, create a new buffer + tempBuffer := make([]byte, len(outputBuffer) + len(inputBuffer)) + + // copy the existing data + copy(tempBuffer, outputBuffer) - if err == E2BIG { - // we need more output buffer to continue - // instead of resizing, lets pull what we got so far - // and set outputLeft back to the buffer size - output += string(outputBuffer[0:bufferSize - 
int(outputLeft)]) - outputLeft = outputReset - } else { - // we got an error we can't continue with - break - } + // switch the buffers + outputBuffer = tempBuffer + + // forget the error + err = nil } } - // free our sourceBuffer, no longer needed - //C.free(unsafe.Pointer(&sourceBuffer)) + // construct the final output string + output = string(outputBuffer[:totalBytesWritten]) - // convert output buffer a go string - output += string(outputBuffer[0:bufferSize - int(outputLeft)]) - - // free our outputBuffer, no longer needed - //C.free(unsafe.Pointer(&outputBuffer)) - - // return result and any err return output, err } diff --git a/sample.ebcdic-us b/examples/sample.ebcdic-us similarity index 100% rename from sample.ebcdic-us rename to examples/sample.ebcdic-us diff --git a/sample.go b/examples/sample.go similarity index 100% rename from sample.go rename to examples/sample.go diff --git a/sample.utf8 b/examples/sample.utf8 similarity index 100% rename from sample.utf8 rename to examples/sample.utf8 diff --git a/iconv.go b/iconv.go index 1ea4d00..0d65d97 100644 --- a/iconv.go +++ b/iconv.go @@ -15,7 +15,7 @@ var ( E2BIG Error = os.Errno(int(C.E2BIG)) ) -func Convert(input []byte, output []byte, fromEncoding string, toEncoding string) (bytesRead int, bytesWritten int, err os.Error) { +func Convert(input []byte, output []byte, fromEncoding string, toEncoding string) (bytesRead int, bytesWritten int, err Error) { // create a new converter converter, err := NewConverter(fromEncoding, toEncoding) @@ -30,7 +30,7 @@ func Convert(input []byte, output []byte, fromEncoding string, toEncoding string return } -func ConvertString(input string, fromEncoding string, toEncoding string) (output string, err os.Error) { +func ConvertString(input string, fromEncoding string, toEncoding string) (output string, err Error) { // create a new converter converter, err := NewConverter(fromEncoding, toEncoding) diff --git a/reader.go b/reader.go index ac77e09..2d9bca1 100644 --- a/reader.go 
+++ b/reader.go @@ -8,10 +8,9 @@ import ( type Reader struct { source io.Reader converter *Converter - rawBuffer []byte - rawReadPos, rawWritePos int - convertedBuffer []byte - convertedReadPos, convertedWritePos int + buffer []byte + readPos, writePos int + err os.Error } func NewReader(source io.Reader, fromEncoding string, toEncoding string) (*Reader, os.Error) { @@ -34,68 +33,68 @@ func NewReaderFromConverter(source io.Reader, converter *Converter) (reader *Rea reader.converter = converter // create 8K buffers - reader.rawBuffer = make([]byte, 8 * 1024) - reader.convertedBuffer = make([]byte, 8 * 1024) + reader.buffer = make([]byte, 8 * 1024) return reader } -func (this *Reader) fillRawBuffer() { +func (this *Reader) fillBuffer() { // slide existing data to beginning - if this.rawReadPos > 0 { - // copy current bytes - copy(this.rawBuffer, this.rawBuffer[this.rawReadPos:this.rawWritePos]) + if this.readPos > 0 { + // copy current bytes - is this guaranteed safe? + copy(this.buffer, this.buffer[this.readPos:this.writePos]) // adjust positions - this.rawWritePos -= this.rawReadPos - this.rawReadPos = 0 + this.writePos -= this.readPos + this.readPos = 0 } // read new data into buffer at write position - bytesRead, err := this.source.Read(this.rawBuffer[this.rawWritePos:]) + bytesRead, err := this.source.Read(this.buffer[this.writePos:]) // adjust write position - this.rawWritePos += bytesRead + this.writePos += bytesRead - // track source reader errors + // track any reader error / EOF if err != nil { - // not sure where to put this for now - } -} - -func (this *Reader) fillConvertedBuffer() { - // slide existing data to beginning - if this.convertedReadPos > 0 { - // copy current bytes - copy(this.convertedBuffer, this.convertedBuffer[this.convertedReadPos:this.convertedWritePos]) - - // adjust positions - this.convertedWritePos -= this.convertedReadPos - this.convertedReadPos = 0 - } - - // use iconv to fill the converted buffer from the raw buffer - bytesRead, 
bytesWritten, err := this.converter.Convert(this.rawBuffer[this.rawReadPos:this.rawWritePos], this.convertedBuffer[this.convertedWritePos:]) - - // adjust read and write positions - this.rawReadPos += bytesRead - this.convertedWritePos += bytesWritten - - // track iconv convert errors - if err != nil { - // not sure where to put this for now + this.err = err } } // implement the io.Reader interface func (this *Reader) Read(p []byte) (n int, err os.Error) { - this.fillRawBuffer() - this.fillConvertedBuffer() + // checks for when we have no data + for this.writePos == 0 || this.readPos == this.writePos { + // if we have an error / EOF, just return it + if this.err != nil { + return n, this.err + } - if this.convertedWritePos - 1 > this.convertedReadPos { - // copy converted bytes into p - n = copy(p, this.convertedBuffer[this.convertedReadPos:this.convertedWritePos]) + // else, fill our buffer + this.fillBuffer() + } + + // TODO: checks for when we have less data than len(p) + + // we should have an appropriate amount of data, convert it into the given buffer + bytesRead, bytesWritten, err := this.converter.Convert(this.buffer[this.readPos:this.writePos], p) + + // adjust byte counters + this.readPos += bytesRead + n += bytesWritten + + // if we experienced an iconv error, check it + if err != nil { + // E2BIG errors can be ignored (we'll get them often) as long + // as at least 1 byte was written. If we experienced an E2BIG + // and no bytes were written then the buffer is too small for + // even the next character + if err != E2BIG || bytesWritten == 0 { + // track anything else + this.err = err + } } - return + // return our results + return n, this.err }