From a84994e6e9f96bba259d4a237d4101803c19139b Mon Sep 17 00:00:00 2001 From: Donovan Jimenez Date: Tue, 25 Apr 2017 00:37:24 -0400 Subject: [PATCH] Rework Reader and Writer * add tests that cover same behaviors as Convert and ConvertString * align read and write behaviors with bufio to play nice * add methods that allow to customize buffer size * add methods to reset, allowing reuse --- converter.go | 173 ++++++++-------- iconv_test.go | 530 +++++++++++++++++++++++++++++++++++++++++++------- reader.go | 142 ++++++++------ writer.go | 201 ++++++++++++++----- 4 files changed, 762 insertions(+), 284 deletions(-) diff --git a/converter.go b/converter.go index 0b404e6..30a0409 100644 --- a/converter.go +++ b/converter.go @@ -6,6 +6,7 @@ package iconv #cgo windows LDFLAGS: -liconv #include #include +#include // As of GO 1.6 passing a pointer to Go pointer, will lead to panic // Therofore we use this wrapper function, to avoid passing **char directly from go @@ -20,47 +21,40 @@ import "unsafe" type Converter struct { context C.iconv_t - open bool } // Initialize a new Converter. If fromEncoding or toEncoding are not supported by // iconv then an EINVAL error will be returned. An ENOMEM error maybe returned if // there is not enough memory to initialize an iconv descriptor -func NewConverter(fromEncoding string, toEncoding string) (converter *Converter, err error) { - converter = new(Converter) - +func NewConverter(fromEncoding string, toEncoding string) (*Converter, error) { // convert to C strings toEncodingC := C.CString(toEncoding) fromEncodingC := C.CString(fromEncoding) // open an iconv descriptor - converter.context, err = C.iconv_open(toEncodingC, fromEncodingC) + context, err := C.iconv_open(toEncodingC, fromEncodingC) // free the C Strings C.free(unsafe.Pointer(toEncodingC)) C.free(unsafe.Pointer(fromEncodingC)) - // check err - if err == nil { - // no error, mark the context as open - converter.open = true + if err != nil { + return nil, err } - return + return &Converter{context}, nil } -// destroy is called during garbage collection -func (this *Converter) destroy() { - this.Close() +// Close a Converter's iconv descriptor explicitly +func (converter *Converter) Close() error { + _, err := C.iconv_close(converter.context) + return err } -// Close a Converter's iconv description explicitly -func (this *Converter) Close() (err error) { - if this.open { - _, err = C.iconv_close(this.context) - } - - return +// Reset state of iconv context +func (converter *Converter) Reset() error { + _, _, err := converter.Convert(nil, nil) + return err } // Convert bytes from an input byte slice into a give output byte slice @@ -74,95 +68,82 @@ func (this *Converter) Close() (err error) { // For shift based output encodings, any end shift byte sequences can be generated by // passing a 0 length byte slice as input. Also passing a 0 length byte slice for output // will simply reset the iconv descriptor shift state without writing any bytes. -func (this *Converter) Convert(input []byte, output []byte) (bytesRead int, bytesWritten int, err error) { - // make sure we are still open - if this.open { - inputLeft := C.size_t(len(input)) - outputLeft := C.size_t(len(output)) - - if inputLeft > 0 && outputLeft > 0 { - // we have to give iconv a pointer to a pointer of the underlying - // storage of each byte slice - so far this is the simplest - // way i've found to do that in Go, but it seems ugly - inputPointer := (*C.char)(unsafe.Pointer(&input[0])) - outputPointer := (*C.char)(unsafe.Pointer(&output[0])) - - _, err = C.call_iconv(this.context, inputPointer, &inputLeft, outputPointer, &outputLeft) - - // update byte counters - bytesRead = len(input) - int(inputLeft) - bytesWritten = len(output) - int(outputLeft) - } else if inputLeft == 0 && outputLeft > 0 { - // inputPointer will be nil, outputPointer is generated as above - outputPointer := (*C.char)(unsafe.Pointer(&output[0])) - - _, err = C.call_iconv(this.context, nil, &inputLeft, outputPointer, &outputLeft) - - // update write byte counter - bytesWritten = len(output) - int(outputLeft) - } else { - // both input and output are zero length, do a shift state reset - _, err = C.call_iconv(this.context, nil, &inputLeft, nil, &outputLeft) - } - } else { - err = syscall.EBADF +func (converter *Converter) Convert(input []byte, output []byte) (bytesRead int, bytesWritten int, err error) { + inputLeft := C.size_t(len(input)) + outputLeft := C.size_t(len(output)) + + var inputPointer, outputPointer *C.char + + if inputLeft > 0 { + inputPointer = (*C.char)(unsafe.Pointer(&input[0])) + } + + if outputLeft > 0 { + outputPointer = (*C.char)(unsafe.Pointer(&output[0])) } + _, err = C.call_iconv(converter.context, inputPointer, &inputLeft, outputPointer, &outputLeft) + + bytesRead = len(input) - int(inputLeft) + bytesWritten = len(output) - int(outputLeft) + return bytesRead, bytesWritten, err } // Convert an input string // -// EILSEQ error may be returned if input contains invalid bytes for the -// Converter's fromEncoding. -func (this *Converter) ConvertString(input string) (output string, err error) { - // make sure we are still open - if this.open { - // construct the buffers - inputBuffer := []byte(input) - outputBuffer := make([]byte, len(inputBuffer)*2) // we use a larger buffer to help avoid resizing later - - // call Convert until all input bytes are read or an error occurs - var bytesRead, totalBytesRead, bytesWritten, totalBytesWritten int - - for totalBytesRead < len(inputBuffer) && err == nil { - // use the totals to create buffer slices - bytesRead, bytesWritten, err = this.Convert(inputBuffer[totalBytesRead:], outputBuffer[totalBytesWritten:]) - - totalBytesRead += bytesRead - totalBytesWritten += bytesWritten - - // check for the E2BIG error specifically, we can add to the output - // buffer to correct for it and then continue - if err == syscall.E2BIG { - // increase the size of the output buffer by another input length - // first, create a new buffer - tempBuffer := make([]byte, len(outputBuffer)+len(inputBuffer)) - - // copy the existing data - copy(tempBuffer, outputBuffer) - - // switch the buffers - outputBuffer = tempBuffer - - // forget the error +// EILSEQ error may be returned if input contains invalid bytes for the Converter's fromEncoding +func (converter *Converter) ConvertString(input string) (output string, err error) { + // construct the buffers + inputBuffer := []byte(input) + outputBuffer := make([]byte, len(inputBuffer)*2) // we use a larger buffer to help avoid resizing later + + // call Convert until all input bytes are read or an error occurs + var bytesRead, totalBytesRead, bytesWritten, totalBytesWritten int + + for totalBytesRead < len(inputBuffer) && err == nil { + // use the totals to create buffer slices + bytesRead, bytesWritten, err = converter.Convert(inputBuffer[totalBytesRead:], outputBuffer[totalBytesWritten:]) + + totalBytesRead += bytesRead + totalBytesWritten += bytesWritten + + switch err { + case syscall.E2BIG: + // increase the size of the output buffer by another input length + // first, create a new buffer + tempBuffer := make([]byte, len(outputBuffer)+len(inputBuffer)) + + // copy the existing data + copy(tempBuffer, outputBuffer) + + // switch the buffers + outputBuffer = tempBuffer + + // forget the error + err = nil + case syscall.EILSEQ, syscall.EINVAL: + // iconv can still return these in cases where it still can proceed such as //IGNORE + if bytesRead > 0 || bytesWritten > 0 { err = nil } } + } - if err == nil { - // perform a final shift state reset - _, bytesWritten, err = this.Convert([]byte{}, outputBuffer[totalBytesWritten:]) - - // update total count - totalBytesWritten += bytesWritten - } + if err == nil { + // perform a final shift state reset + _, bytesWritten, err = converter.Convert(nil, outputBuffer[totalBytesWritten:]) - // construct the final output string - output = string(outputBuffer[:totalBytesWritten]) - } else { - err = syscall.EBADF + // update total count + totalBytesWritten += bytesWritten } + // construct the final output string + output = string(outputBuffer[:totalBytesWritten]) + return output, err } + +func finalizeConverter(converter *Converter) { + converter.Close() +} diff --git a/iconv_test.go b/iconv_test.go index fed0869..20e4539 100644 --- a/iconv_test.go +++ b/iconv_test.go @@ -1,6 +1,9 @@ package iconv import ( + "bytes" + "io" + "strings" "syscall" "testing" ) @@ -13,105 +16,486 @@ type iconvTest struct { outputEncoding string bytesRead int bytesWritten int - err error + convertErr error // err from Convert (raw iconv) + err error // err from CovertString, Reader, Writer } -var iconvTests = []iconvTest{ - iconvTest{ - "simple utf-8 to latin1 conversion success", - "Hello World!", "utf-8", - "Hello World!", "latin1", - 12, 12, nil, - }, - iconvTest{ - "invalid source encoding causes EINVAL", - "", "doesnotexist", - "", "utf-8", - 0, 0, syscall.EINVAL, - }, - iconvTest{ - "invalid destination encoding causes EINVAL", - "", "utf-8", - "", "doesnotexist", - 0, 0, syscall.EINVAL, - }, - iconvTest{ - "invalid input sequence causes EILSEQ", - "\xFF", "utf-8", - "", "latin1", - 0, 0, syscall.EILSEQ, - }, - iconvTest{ - "invalid input causes partial output and EILSEQ", - "Hello\xFF", "utf-8", - "Hello", "latin1", - 5, 5, syscall.EILSEQ, - }, +var ( + iconvTests = []iconvTest{ + iconvTest{ + "simple utf-8 to latin1 conversion success", + "Hello World!", "utf-8", + "Hello World!", "latin1", + 12, 12, nil, nil, + }, + iconvTest{ + "invalid source encoding causes EINVAL", + "", "doesnotexist", + "", "utf-8", + 0, 0, syscall.EINVAL, syscall.EINVAL, + }, + iconvTest{ + "invalid destination encoding causes EINVAL", + "", "utf-8", + "", "doesnotexist", + 0, 0, syscall.EINVAL, syscall.EINVAL, + }, + iconvTest{ + "utf-8 to utf-8 passthrough", + "Hello world!", "utf-8", + "Hello world!", "utf-8", + 12, 12, nil, nil, + }, + iconvTest{ + "utf-8 to utf-8 partial", + "Hello\xFFWorld!", "utf-8", + "Hello", "utf-8", + 5, 5, syscall.EILSEQ, syscall.EILSEQ, + }, + iconvTest{ + "utf-8 to utf-8 ignored", + "Hello \xFFWorld!", "utf-8", + "Hello World!", "utf-8//IGNORE", + 13, 12, syscall.EILSEQ, nil, + }, + iconvTest{ + "invalid input sequence causes EILSEQ", + "\xFF", "utf-8", + "", "latin1", + 0, 0, syscall.EILSEQ, syscall.EILSEQ, + }, + iconvTest{ + "incomplete input sequence causes EINVAL", + "\xC2", "utf-8", + "", "latin1", + 0, 0, syscall.EINVAL, syscall.EINVAL, + }, + iconvTest{ + "invalid input causes partial output and EILSEQ", + "Hello\xFF", "utf-8", + "Hello", "latin1", + 5, 5, syscall.EILSEQ, syscall.EILSEQ, + }, + iconvTest{ + "incomplete input causes partial output and EILSEQ", + "Hello\xC2", "utf-8", + "Hello", "latin1", + 5, 5, syscall.EINVAL, syscall.EINVAL, + }, + /* this is only true for glibc / iconv + iconvTest{ + "valid input but no conversion causes EILSEQ", + "你好世界 Hello World", "utf-8", + "", "latin1", + 0, 0, syscall.EILSEQ, syscall.EILSEQ, + },*/ + iconvTest{ + "invalid input with ignore", + "Hello\xFF World!", "utf-8", + "Hello World!", "latin1//IGNORE", + 13, 12, syscall.EILSEQ, nil, + }, + iconvTest{ + "valid input but no conversion with IGNORE", + "你好世界 Hello World", "utf-8", + " Hello World", "latin1//IGNORE", + 24, 12, syscall.EILSEQ, nil, + }, + iconvTest{ + "valid input but no conversion with TRANSLIT", + "你好世界 Hello World", "utf-8", + "???? Hello World", "latin1//TRANSLIT", + 24, 16, nil, nil, + }, + } + + ignoreDetected, translitDetected bool +) + +func init() { + // detect if IGNORE / TRANSLIT is supported (glic / libiconv) + conv, err := NewConverter("utf-8", "ascii//IGNORE") + + if err == nil { + ignoreDetected = true + conv.Close() + } + + conv, err = NewConverter("utf-8", "ascii//TRANSLIT") + + if err == nil { + translitDetected = true + conv.Close() + } } -func TestConvertString(t *testing.T) { +func runTests(t *testing.T, f func(iconvTest, *testing.T) (int, int, string, error)) { for _, test := range iconvTests { - // perform the conversion - output, err := ConvertString(test.input, test.inputEncoding, test.outputEncoding) + t.Run(test.description, func(t *testing.T) { + if !ignoreDetected && strings.HasSuffix(test.outputEncoding, "//IGNORE") { + t.Skip("//IGNORE not supported") + } - // check that output and err match - if output != test.output { - t.Errorf("test \"%s\" failed, output did not match expected", test.description) - } + if !translitDetected && strings.HasSuffix(test.outputEncoding, "//TRANSLIT") { + t.Skip("//TRANSLIT not supported") + } - // check that err is same as expected - if err != test.err { - if test.err != nil { - if err != nil { - t.Errorf("test \"%s\" failed, got %s when expecting %s", test.description, err, test.err) + bytesRead, bytesWritten, output, err := f(test, t) + + // check that bytesRead is same as expected + if bytesRead != test.bytesRead { + t.Errorf("bytesRead: %d expected: %d", bytesRead, test.bytesRead) + } + + // check that bytesWritten is same as expected + if bytesWritten != test.bytesWritten { + t.Errorf("bytesWritten: %d expected: %d", bytesWritten, test.bytesWritten) + } + + // check output bytes against expected + if output != test.output { + t.Errorf("output: %x expected: %x", output, test.output) + } + + // check that err is same as expected + if err != test.err { + if test.err != nil { + if err != nil { + t.Errorf("err: %q expected: %q", err, test.err) + } else { + t.Errorf("err: nil expected %q", test.err) + } } else { - t.Errorf("test \"%s\" failed, got nil when expecting %s", test.description, test.err) + t.Errorf("unexpected error: %q", err) } - } else { - t.Errorf("test \"%s\" failed, got unexpected error: %s", test.description, err) } - } + }) } } func TestConvert(t *testing.T) { - for _, test := range iconvTests { - // setup input buffer + runTests(t, func(test iconvTest, t *testing.T) (int, int, string, error) { input := []byte(test.input) - - // setup a buffer as large as the expected bytesWritten output := make([]byte, 50) // peform the conversion bytesRead, bytesWritten, err := Convert(input, output, test.inputEncoding, test.outputEncoding) - // check that bytesRead is same as expected - if bytesRead != test.bytesRead { - t.Errorf("test \"%s\" failed, bytesRead did not match expected", test.description) + // HACK Convert has different erorrs, so check ourselves, and then fake out later check + if err != test.convertErr { + if test.convertErr != nil { + if err != nil { + t.Errorf("err: %q expected: %q", err, test.convertErr) + } else { + t.Errorf("err: nil expected %q", test.convertErr) + } + } else { + t.Errorf("unexpected error: %q", err) + } } + err = test.err - // check that bytesWritten is same as expected - if bytesWritten != test.bytesWritten { - t.Errorf("test \"%s\" failed, bytesWritten did not match expected", test.description) - } + return bytesRead, bytesWritten, string(output[:bytesWritten]), err + }) +} - // check output bytes against expected - simplest to convert output to - // string and then do an equality check which is actually a byte wise operation - if string(output[:bytesWritten]) != test.output { - t.Errorf("test \"%s\" failed, output did not match expected", test.description) - } +func TestConvertString(t *testing.T) { + runTests(t, func(test iconvTest, t *testing.T) (int, int, string, error) { + // perform the conversion + output, err := ConvertString(test.input, test.inputEncoding, test.outputEncoding) - // check that err is same as expected - if err != test.err { - if test.err != nil { - if err != nil { - t.Errorf("test \"%s\" failed, got %s when expecting %s", test.description, err, test.err) - } else { - t.Errorf("test \"%s\" failed, got nil when expecting %s", test.description, test.err) + // bytesRead and bytesWritten are spoofed a little + return test.bytesRead, len(output), output, err + }) +} + +func TestReader(t *testing.T) { + runTests(t, func(test iconvTest, t *testing.T) (int, int, string, error) { + var bytesRead, bytesWritten, finalBytesWritten int + var err error + + input := bytes.NewBufferString(test.input) + output := make([]byte, 50) + + reader, err := NewReader(input, test.inputEncoding, test.outputEncoding) + + if err == nil { + bytesWritten, err = reader.Read(output) + + // we can compute how many bytes iconv read by inspecting the reader state + bytesRead = len([]byte(test.input)) - input.Len() - (reader.writePos - reader.readPos) + + // with current tests and buffer sizes, we'd expect all input to be buffered if we called read + if input.Len() != 0 { + t.Error("not all bytes from input were buffered") + } + + // do final read test if we can - either get EOF or same test error + if err == nil { + finalBytesWritten, err = reader.Read(output[bytesWritten:]) + + if finalBytesWritten != 0 { + t.Errorf("finalBytesWritten: %d expected: 0", finalBytesWritten) + } + + if err == io.EOF { + err = nil } - } else { - t.Errorf("test \"%s\" failed, got unexpected error: %s", test.description, err) } } + + return bytesRead, bytesWritten, string(output[:bytesWritten]), err + }) +} + +func TestWriter(t *testing.T) { + runTests(t, func(test iconvTest, t *testing.T) (int, int, string, error) { + var bytesRead, bytesWritten int + var err error + + input := []byte(test.input) + output := new(bytes.Buffer) + + writer, err := NewWriter(output, test.inputEncoding, test.outputEncoding) + + if err == nil { + bytesRead, err = writer.Write(input) + bytesRead -= writer.readPos + writer.Close() + + bytesWritten = output.Len() + } + + return bytesRead, bytesWritten, output.String(), err + }) +} + +func TestReaderWithCopy(t *testing.T) { + runTests(t, func(test iconvTest, t *testing.T) (int, int, string, error) { + input := bytes.NewBufferString(test.input) + output := new(bytes.Buffer) + + reader, err := NewReader(input, test.inputEncoding, test.outputEncoding) + + if err == nil { + _, err := io.Copy(output, reader) + + bytesRead := len(test.input) - input.Len() - reader.writePos + bytesWritten := output.Len() + + return bytesRead, bytesWritten, output.String(), err + } + + return 0, 0, output.String(), err + }) +} + +func TestWriterWithCopy(t *testing.T) { + runTests(t, func(test iconvTest, t *testing.T) (int, int, string, error) { + input := bytes.NewBufferString(test.input) + output := new(bytes.Buffer) + + writer, err := NewWriter(output, test.inputEncoding, test.outputEncoding) + + if err == nil { + bytesCopied, err := io.Copy(writer, input) + bytesRead := int(bytesCopied) - writer.readPos + writer.Close() + + bytesWritten := output.Len() + + return bytesRead, bytesWritten, output.String(), err + } + + return 0, 0, output.String(), err + }) +} + +func TestReaderMultipleReads(t *testing.T) { + // setup a source reader and our expected output string + source := bytes.NewBufferString("\x80\x8A\x99\x95\x8B\x86\x87") + expected := "€Š™•‹†‡" + + // setup reader - use our minimum buffer size so we can force it to shuffle the buffer around + reader, err := NewReaderSized(source, "cp1252", "utf-8", minReadBufferSize) + + if err != nil { + if err == syscall.EINVAL { + t.Skip("Either cp1252 or utf-8 isn't supported by iconv on your system") + } else { + t.Fatalf("Unexpected error when creating reader: %s", err) + } + } + + // setup a read buffer - we'll slice it to different sizes in our tests + buffer := make([]byte, 64) + + // first read should fill internal buffer, but we'll only read part of it + bytesRead, err := reader.Read(buffer[:5]) + + if bytesRead != 5 || err != nil { + t.Fatalf("first read did not give expected 5, nil: %d, %s", bytesRead, err) + } + + // because of how small teh source is and our minimum buffer size, source shoudl be fully read + if source.Len() != 0 { + t.Fatalf("first read did not buffer all of source like expected: %d bytes remain", source.Len()) + } + + // Buffer doesn't return EOF with last bytes, reader shouldn't know its EOF yet + if reader.eof { + t.Fatalf("first read was not expected to receive EOF") + } + + // second read should shift internal buffer, and fill again - make buffer too small for last utf-8 character + // E2BIG from iconv should be ignored because we wrote at least 1 byte + bytesRead, err = reader.Read(buffer[5:18]) + + if bytesRead != 12 || err != nil { + t.Fatalf("second read did not give expected 15, nil: %d, %s", bytesRead, err) + } + + if !reader.eof { + t.Fatalf("second read did not put reader into eof state") + } + + // try to read the last 3 byte character with only a buffer of 2 bytes - this time we should see the E2BIG + bytesRead, err = reader.Read(buffer[17:19]) + + if bytesRead != 0 || err != syscall.E2BIG { + t.Fatalf("third read did not give expected 0, E2BIG: %d, %s", bytesRead, err) + } + + // fourth read should finish last character + bytesRead, err = reader.Read(buffer[17:]) + + if bytesRead != 3 || err != nil { + t.Fatalf("fourth read did not give expected 3, nil: %d, %s", bytesRead, err) + } + + // last read should be EOF + bytesRead, err = reader.Read(buffer[20:]) + + if bytesRead != 0 || err != io.EOF { + t.Fatalf("final read did not give expected 0, EOF: %d, %s", bytesRead, err) + } + + // check full utf-8 output + if string(buffer[:20]) != expected { + t.Fatalf("output did not match expected %q: %q", expected, string(buffer[:20])) + } +} + +func TestWriteWithIncompleteSequence(t *testing.T) { + expected := "\x80\x8A\x99\x95\x8B\x86\x87" + input := []byte("€Š™•‹†‡") + output := new(bytes.Buffer) + + writer, err := NewWriter(output, "utf-8", "cp1252") + + if err != nil { + t.Fatalf("unexpected error while creating writer %q", err) + } + + // the input string is made of 3 byte characters, for the test we want to only write part of the last character + bytesFromBuffer := len(input) - 2 + + bytesRead, err := writer.Write(input[:bytesFromBuffer]) + + if bytesRead != bytesFromBuffer { + t.Fatalf("did a short write on first write: %d, %s", bytesRead, err) + } + + // finish the rest + bytesRead, err = writer.Write(input[bytesFromBuffer:]) + + if bytesRead != 2 { + t.Fatalf("did a short write on second write: %d, %s", bytesRead, err) + } + + err = writer.Close() + actual := output.String() + + if err != nil { + t.Errorf("got an error on close: %s", err) + } + + if actual != expected { + t.Errorf("output %x did not match expected %x", actual, expected) + } +} + +func TestWriteWithIncompleteSequenceAndIgnore(t *testing.T) { + if !ignoreDetected { + t.Skip("//IGNORE not supported") + } + + expected := "\x80\x8A\x99\x95\x8B\x86\x87" + input := []byte("€Š™•‹†‡") + output := new(bytes.Buffer) + + writer, err := NewWriter(output, "utf-8", "cp1252//IGNORE") + + if err != nil { + t.Fatalf("unexpected error while creating writer %q", err) + } + + // the input string is made of 3 byte characters, for the test we want to only write part of the last character + bytesFromBuffer := len(input) - 2 + + bytesRead, err := writer.Write(input[:bytesFromBuffer]) + + if bytesRead != bytesFromBuffer { + t.Fatalf("did a short write on first write: %d, %s", bytesRead, err) + } + + // finish the rest + bytesRead, err = writer.Write(input[bytesFromBuffer:]) + + if bytesRead != 2 { + t.Fatalf("did a short write on second write: %d, %s", bytesRead, err) + } + + err = writer.Close() + actual := output.String() + + if err != nil { + t.Errorf("got an error on close: %s", err) + } + + if actual != expected { + t.Errorf("output %x did not match expected %x", actual, expected) + } +} + +func TestWriteWithIncompleteSequenceAtEOF(t *testing.T) { + expected := "\x80\x8A\x99\x95\x8B\x86" + input := []byte("€Š™•‹†‡") + output := new(bytes.Buffer) + + writer, err := NewWriter(output, "utf-8", "cp1252") + + if err != nil { + t.Fatalf("unexpected error while creating writer %q", err) + } + + // the input string is made of 3 byte characters, for the test we want to only write part of the last character + bytesFromBuffer := len(input) - 2 + + bytesRead, err := writer.Write(input[:bytesFromBuffer]) + + if bytesRead != bytesFromBuffer { + t.Fatalf("did a short write on first write: %d, %s", bytesRead, err) + } + + err = writer.Close() + actual := output.String() + + if err != nil { + t.Errorf("got an error on close: %s", err) + } + + if actual != expected { + t.Errorf("output %x did not match expected %x", actual, expected) } } diff --git a/reader.go b/reader.go index 2835ce6..472d55c 100644 --- a/reader.go +++ b/reader.go @@ -2,7 +2,12 @@ package iconv import ( "io" - "syscall" + "runtime" +) + +const ( + defaultReadBufferSize = 8 * 1024 + minReadBufferSize = 16 ) type Reader struct { @@ -10,91 +15,100 @@ type Reader struct { converter *Converter buffer []byte readPos, writePos int - err error + eof bool } -func NewReader(source io.Reader, fromEncoding string, toEncoding string) (*Reader, error) { - // create a converter - converter, err := NewConverter(fromEncoding, toEncoding) - - if err == nil { - return NewReaderFromConverter(source, converter), err - } +func NewReader(source io.Reader, fromEncoding, toEncoding string) (*Reader, error) { + return NewReaderSized(source, fromEncoding, toEncoding, defaultReadBufferSize) +} - // return the error - return nil, err +func NewReaderFromConverter(source io.Reader, converter *Converter) *Reader { + return NewReaderFromConverterSized(source, converter, defaultReadBufferSize) } -func NewReaderFromConverter(source io.Reader, converter *Converter) (reader *Reader) { - reader = new(Reader) +func NewReaderSized(source io.Reader, fromEncoding, toEncoding string, size int) (*Reader, error) { + converter, err := NewConverter(fromEncoding, toEncoding) - // copy elements - reader.source = source - reader.converter = converter + if err != nil { + return nil, err + } - // create 8K buffers - reader.buffer = make([]byte, 8*1024) + // add a finalizer for the converter we created + runtime.SetFinalizer(converter, finalizeConverter) - return reader + return NewReaderFromConverterSized(source, converter, size), nil } -func (this *Reader) fillBuffer() { - // slide existing data to beginning - if this.readPos > 0 { - // copy current bytes - is this guaranteed safe? - copy(this.buffer, this.buffer[this.readPos:this.writePos]) - - // adjust positions - this.writePos -= this.readPos - this.readPos = 0 +func NewReaderFromConverterSized(source io.Reader, converter *Converter, size int) *Reader { + if size < minReadBufferSize { + size = minReadBufferSize } - // read new data into buffer at write position - bytesRead, err := this.source.Read(this.buffer[this.writePos:]) - - // adjust write position - this.writePos += bytesRead - - // track any reader error / EOF - if err != nil { - this.err = err + return &Reader{ + source: source, + converter: converter, + buffer: make([]byte, size), } } -// implement the io.Reader interface -func (this *Reader) Read(p []byte) (n int, err error) { - // checks for when we have no data - for this.writePos == 0 || this.readPos == this.writePos { - // if we have an error / EOF, just return it - if this.err != nil { - return n, this.err +func (r *Reader) Read(p []byte) (int, error) { + if len(p) == 0 { + return 0, nil + } + + var bytesRead, bytesWritten int + var err error + + // setup for a single read into buffer if possible + if !r.eof { + if r.readPos > 0 { + // slide data to front of buffer + r.readPos, r.writePos = 0, copy(r.buffer, r.buffer[r.readPos:r.writePos]) } - // else, fill our buffer - this.fillBuffer() - } + if r.writePos < len(r.buffer) { + // do the single read + bytesRead, err = r.source.Read(r.buffer[r.writePos:]) - // TODO: checks for when we have less data than len(p) + if bytesRead < 0 { + panic("iconv: source reader returned negative count from Read") + } - // we should have an appropriate amount of data, convert it into the given buffer - bytesRead, bytesWritten, err := this.converter.Convert(this.buffer[this.readPos:this.writePos], p) + r.writePos += bytesRead + r.eof = err == io.EOF + } + } - // adjust byte counters - this.readPos += bytesRead - n += bytesWritten + if r.readPos < r.writePos || r.eof { + // convert any buffered data we have, or do a final reset (for shift based conversions) + bytesRead, bytesWritten, err = r.converter.Convert(r.buffer[r.readPos:r.writePos], p) + r.readPos += bytesRead - // if we experienced an iconv error, check it - if err != nil { - // E2BIG errors can be ignored (we'll get them often) as long - // as at least 1 byte was written. If we experienced an E2BIG - // and no bytes were written then the buffer is too small for - // even the next character - if err != syscall.E2BIG || bytesWritten == 0 { - // track anything else - this.err = err + // if we experienced an iconv error and didn't make progress, report it. + // if we did make progress, it may be informational only (i.e. reporting + // an EILSEQ even when using //ignore to skip them) + if err != nil && bytesWritten == 0 { + return bytesWritten, err } + + // signal an EOF only if we didn't write anything - accomodates premature + // errror checking in user code + if bytesWritten == 0 && r.eof { + return 0, io.EOF + } + + return bytesWritten, nil } - // return our results - return n, this.err + return 0, err +} + +func (r *Reader) Reset(source io.Reader) { + r.converter.Reset() + + *r = Reader{ + source: source, + converter: r.converter, + buffer: r.buffer, + } } diff --git a/writer.go b/writer.go index db007eb..29d3c85 100644 --- a/writer.go +++ b/writer.go @@ -1,82 +1,181 @@ package iconv -import "io" +import ( + "io" + "runtime" + "syscall" +) + +const ( + defaultWriteBufferSize = 8 * 1024 + minWriteBufferSize = 16 +) type Writer struct { - destination io.Writer - converter *Converter - buffer []byte - readPos, writePos int - err error + destination io.Writer + converter *Converter + readBuffer, writeBuffer []byte + readPos, writePos int } func NewWriter(destination io.Writer, fromEncoding string, toEncoding string) (*Writer, error) { - // create a converter + return NewWriterSized(destination, fromEncoding, toEncoding, defaultWriteBufferSize) +} + +func NewWriterFromConverter(destination io.Writer, converter *Converter) (writer *Writer) { + return NewWriterFromConverterSized(destination, converter, defaultWriteBufferSize) +} + +func NewWriterSized(destination io.Writer, fromEncoding, toEncoding string, size int) (*Writer, error) { converter, err := NewConverter(fromEncoding, toEncoding) - if err == nil { - return NewWriterFromConverter(destination, converter), err + if err != nil { + return nil, err } - // return the error - return nil, err + // add a finalizer for the converter we created + runtime.SetFinalizer(converter, finalizeConverter) + + return NewWriterFromConverterSized(destination, converter, size), nil } -func NewWriterFromConverter(destination io.Writer, converter *Converter) (writer *Writer) { - writer = new(Writer) +func NewWriterFromConverterSized(destination io.Writer, converter *Converter, size int) *Writer { + if size < minWriteBufferSize { + size = minWriteBufferSize + } - // copy elements - writer.destination = destination - writer.converter = converter + return &Writer{ + destination: destination, + converter: converter, + readBuffer: make([]byte, size), + writeBuffer: make([]byte, size), + } +} - // create 8K buffers - writer.buffer = make([]byte, 8*1024) +// Implements io.Writer +// +// Will attempt to convert all of p into buffer. If there's not enough room in +// the buffer to hold all converted bytes, the buffer will be flushed and p will +// continue to be processed. Close should be called on a writer when finished +// with all writes, to ensure final shift sequences are written and buffer is +// flushed to underlying io.Writer. +// +// Can return all errors that Convert can, as well as any errors from Flush. Note +// that some errors from Convert are suppressed if we continue making progress +// on p. +func (w *Writer) Write(p []byte) (int, error) { + var totalBytesRead, bytesRead, bytesWritten int + var err error + + if w.readPos == 0 || len(p) == 0 { + bytesRead, bytesWritten, err = w.converter.Convert(p, w.writeBuffer[w.writePos:]) + totalBytesRead += bytesRead + w.writePos += bytesWritten + w.readPos = 0 + } else { + // we have left over bytes from previous write that weren't complete and there's at least + // one byte being written, fill read buffer with p and try to convert, if we make progress + // we can continue conversion from p itself + bytesCopied := copy(w.readBuffer[w.readPos:], p) + + bytesRead, bytesWritten, err = w.converter.Convert(w.readBuffer[:w.readPos+bytesCopied], w.writeBuffer[w.writePos:]) + + // if we made no progress, give up + if bytesRead <= w.readPos { + return 0, err + } - return writer -} + bytesRead -= w.readPos + totalBytesRead += bytesRead -func (this *Writer) emptyBuffer() { - // write new data out of buffer - bytesWritten, err := this.destination.Write(this.buffer[this.readPos:this.writePos]) + w.readPos = 0 + w.writePos += bytesWritten + } - // update read position - this.readPos += bytesWritten + // try to process all of p - lots of io functions don't like short writes. + // + // There are a few error cases we need to treat specially, as long as we've + // made progress on p, E2BIG and EILSEQ should not be fatal. EINVAL isn't + // fatal as long as the rest of p fits in our buffers. + for err != nil && bytesRead > 0 { + switch err { + case syscall.E2BIG: + err = w.Flush() + + case syscall.EILSEQ: + // IGNORE suffix still reports the error on convert + err = nil + + // if no more bytes, don't do an empty convert (resets state) + if totalBytesRead == len(p) { + break + } + + case syscall.EINVAL: + // if the rest of p fits in read buffer copy it there + if len(p[totalBytesRead:]) <= len(w.readBuffer) { + w.readPos = copy(w.readBuffer, p[totalBytesRead:]) + totalBytesRead += w.readPos + break + } + } - // slide existing data to beginning - if this.readPos > 0 { - // copy current bytes - is this guaranteed safe? - copy(this.buffer, this.buffer[this.readPos:this.writePos]) + // if not an ignoreable err or Flush err + if err != nil { + break + } - // adjust positions - this.writePos -= this.readPos - this.readPos = 0 + bytesRead, bytesWritten, err = w.converter.Convert(p[totalBytesRead:], w.writeBuffer[w.writePos:]) + totalBytesRead += bytesRead + w.writePos += bytesWritten } - // track any reader error / EOF - if err != nil { - this.err = err - } + return totalBytesRead, err } -// implement the io.Writer interface -func (this *Writer) Write(p []byte) (n int, err error) { - // write data into our internal buffer - bytesRead, bytesWritten, err := this.converter.Convert(p, this.buffer[this.writePos:]) +// Attempt to write any buffered data to destination writer. Returns error from +// Write call or io.ErrShortWrite if Write didn't report an error but also didn't +// accept all bytes given. +func (w *Writer) Flush() error { + if w.readPos < w.writePos { + bytesWritten, err := w.destination.Write(w.writeBuffer[:w.writePos]) - // update bytes written for return - n += bytesRead - this.writePos += bytesWritten + if bytesWritten < 0 { + panic("iconv: writer returned negative count from Write") + } + + if bytesWritten > 0 { + w.writePos = copy(w.writeBuffer, w.writeBuffer[bytesWritten:w.writePos]) + } - // checks for when we have a full buffer - for this.writePos > 0 { - // if we have an error, just return it - if this.err != nil { - return + if err == nil && w.writePos > 0 { + err = io.ErrShortWrite } - // else empty the buffer - this.emptyBuffer() + return err } - return n, err + return nil +} + +// Perform a final write with empty buffer, which allows iconv to close any shift +// sequences. A Flush is performed if needed. +func (w *Writer) Close() error { + _, err := w.Write(nil) + if err != nil { + return err + } + return w.Flush() +} + +// Reset state and direct writes to a new destination writer +func (w *Writer) Reset(destination io.Writer) { + w.converter.Reset() + + *w = Writer{ + destination: destination, + converter: w.converter, + readBuffer: w.readBuffer, + writeBuffer: w.writeBuffer, + } }