Improving documentation and including shift reset logic

2011-01-29 01:31:00 -05:00 · 2011-01-29 01:31:00 -05:00 · 5ea739d3eb
commit 5ea739d3eb
parent 20aa6d93c3
4 changed files with 206 additions and 104 deletions
--- a/10
+++ b/10
@ -1,3 +1,4 @@
+# standard GO make file preamble
 include $(GOROOT)/src/Make.inc

 # target package name
@ -6,10 +7,17 @@ TARG=iconv
 # regular go files
 GOFILES=\
 	reader.go\
+	writer.go\

 # files that must be processed by cgo
 CGOFILES=\
 	converter.go\
 	iconv.go\

-include $(GOROOT)/src/Make.pkg
+# on non glibc systems, we usually need to load the library
+ifneq ($(GOOS),linux)
+CGO_LDFLAGS=-liconv
+endif
+
+# standard GO make file include for packages
+include $(GOROOT)/src/Make.pkg
--- a/README.md
+++ b/README.md
@ -1,72 +1,115 @@
-Install
-=======
+# Install

-The goinstall command can be used:
-
-	goinstall github.com/djimenez/iconv.go
-
-Or, you can clone the repository and use gomake instead
+The main method of installation is through gomake (provided in $GOROOT/bin)

 	git clone git://github.com/djimenez/iconv.go.git iconv
 	cd iconv
 	gomake install

-Usage
-=====
+Alternatively, you can try using goinstall (also provided in $GOROOT/bin).
+However, because iconv.go uses cgo to wrap iconv functions, the build may not
+succeed on all systems. At time of writing goinstall was still experimental and
+has known issues with cgo based packages because of how it produces its own
+make file.
+
+	goinstall github.com/djimenez/iconv.go
+
+# Usage

 To use the package, you'll need the appropriate import statement:

 	import (
-		// if you used goinstall, you'll want this import
-		iconv "github.com/djimenez/iconv.go"
-
 		// if you used gomake install directly, you'll want this import
 		iconv
+		
+		// if you used goinstall, you'll want this import
+		iconv "github.com/djimenez/iconv.go"
 	)

-Converting string Values 
------------------------
+## Converting string Values 

-Converting a string can be done with two methods. First, there's iconv.ConvertString(input, fromEncoding, toEncoding string)
+Converting a string can be done with two methods. First, there's
+iconv.ConvertString(input, fromEncoding, toEncoding string)

 	output,_ := iconv.ConvertString("Hello World!", "utf-8", "windows-1252")

-Alternatively, you can create a converter and use its ConvertString method. This mostly just saves having to parse the from and to encodings when converting many strings in the same way.
+Alternatively, you can create a converter and use its ConvertString method.
+Reuse of a Converter instance is recommended when doing many string conversions
+between the same encodings.

 	converter := iconv.NewConverter("utf-8", "windows-1252")
 	output,_ := converter.ConvertString("Hello World!")
-
-Converting []byte Values
------------------------
-
-Converting a []byte can similarly be done with two methods. First, there's iconv.Convert(input, output []byte, fromEncoding, toEncoding string). You'll immediately notice this requires you to give it both the input and output buffer. Ideally, the output buffer should be sized so that it can hold all converted bytes from input, but if it cannot, then Convert will put as many bytes as it can into the buffer without creating an invalid sequence. For example, if iconv only has a single byte left in the output buffer but needs 2 or more for the complete character in a multibyte encoding it will stop writing to the buffer and return with an iconv.E2BIG error.
-
-	input := []byte("Hello World!")
-	output := make([]byte, len(input))
 	
-	bytesRead, bytesWritten, error := iconv.Convert(input, output, "utf-8", "windows-1252")
+	// converter can then be closed explicitly
+	// this will also happen when garbage collected
+	converter.Close()

-Just like with ConvertString, there is also a Convert method on Converter that can be used.
+ConvertString may return errors for the following reasons:
+
+ * EINVAL - when either the from or to encoding is not supported by iconv
+ * EILSEQ - when the input string contains an invalid byte sequence for the
+   given from encoding
+
+## Converting []byte Values
+
+Converting a []byte can similarly be done with two methods. First, there's
+iconv.Convert(input, output []byte, fromEncoding, toEncoding string). You'll
+immediately notice this requires you to give it both the input and output
+buffer. Ideally, the output buffer should be sized so that it can hold all
+converted bytes from input, but if it cannot, then Convert will put as many
+bytes as it can into the buffer without creating an invalid sequence. For
+example, if iconv only has a single byte left in the output buffer but needs 2
+or more for the complete character in a multibyte encoding it will stop writing
+to the buffer and return with an iconv.E2BIG error.
+
+	in := []byte("Hello World!")
+	out := make([]byte, len(in))
+	
+	bytesRead, bytesWritten, err := iconv.Convert(in, out, "utf-8", "latin1")
+
+Just like with ConvertString, there is also a Convert method on Converter that
+can be used.

 	...
 	converter := iconv.NewConverter("utf-8", "windows-1252")
 	
 	bytesRead, bytesWritten, error := converter.Convert(input, output)
+	
+Convert may return errors for the following reasons:

-Converting an *io.Reader
------------------------
+ * EINVAL - when either the from or to encoding is not supported by iconv
+ * EILSEQ - when the input string contains an invalid byte sequence for the
+   given from encoding
+ * E2BIG - when the output buffer is not big enough to hold the full
+   conversion of input
+   
+   Note on E2BIG: this is a common error value especially when converting to a
+   multibyte encoding and should not be considered fatal. Partial conversion
+   has probably occurred, so be sure to check bytesRead and bytesWritten.

-The iconv.Reader allows any other *io.Reader to be wrapped and have its bytes transcoded as they are read. 
+### Note on Shift Based Encodings

-	// We're wrapping stdin for simplicity, but a File or network reader could be wrapped as well
+When using the iconv.Convert convenience method, it will automatically try to
+append to your output buffer with a nil input so that any end shift sequences
+are appropriately written. Using a Converter.Convert method however will not
+automatically do this since it can be used to process a full stream in chunks.
+So you'll need to remember to pass a nil input buffer at the end yourself, just
+like you would with direct iconv usage.
+
+## Converting an *io.Reader
+
+The iconv.Reader allows any other *io.Reader to be wrapped and have its bytes
+transcoded as they are read. 
+
+	// We're wrapping stdin for simplicity, but a File or network reader could
+	// be wrapped as well
 	reader,_ := iconv.NewReader(os.Stdin, "utf-8", "windows-1252")

-Converting an *io.Writer
------------------------
+## Converting an *io.Writer

-To be written.
+The iconv.Writer allows any other *io.Writer to be wrapped and have its bytes
+transcoded as they are written. 

-Piping a Conversion
-------------------
-
-To be written.
+	// We're wrapping stdout for simplicity, but a File or network writer could
+	// be wrapped as well
+	writer,_ := iconv.NewWriter(os.Stdout, "utf-8", "windows-1252")
--- a/converter.go
+++ b/converter.go
@ -1,25 +1,25 @@
 package iconv

 /*
-#include <iconv.h>
 #include <stdlib.h>
+#include <iconv.h>
 */
 import "C"
-
-import (
-	"os"
-	"unsafe"
-)
+import "os"
+import "unsafe"

 type Converter struct {
 	context C.iconv_t
 	open bool
 }

+// Initialize a new Converter. If fromEncoding or toEncoding are not supported by
+// iconv then an EINVAL error will be returned. An ENOMEM error may be returned if
+// there is not enough memory to initialize an iconv descriptor.
 func NewConverter(fromEncoding string, toEncoding string) (converter *Converter, err Error) {
 	converter = new(Converter)

-	// create C strings
+	// convert to C strings
 	toEncodingC := C.CString(toEncoding)
 	fromEncodingC := C.CString(fromEncoding)

@ -39,12 +39,12 @@ func NewConverter(fromEncoding string, toEncoding string) (converter *Converter,
 	return
 }

-// Called before garbage collection
+// destroy is called during garbage collection
 func (this *Converter) destroy() {
 	this.Close()
 }

-// The converter can be explicitly closed if desired
+// Close a Converter's iconv descriptor explicitly
 func (this *Converter) Close() (err os.Error) {
 	if this.open {
 		_, err = C.iconv_close(this.context)
@ -53,73 +53,106 @@ func (this *Converter) Close() (err os.Error) {
 	return
 }

-// read bytes from an input buffer, and write them to and output buffer
-// will return the number of bytesRead from the input and the number of bytes
-// written to the output as well as any iconv errors
+// Convert bytes from an input byte slice into a given output byte slice
 //
-// NOTE: not all bytes may be consumed from the input. This can be because the output
-// buffer is too small or because there were iconv errors
+// As many bytes as can be converted and fit into the size of output will be
+// processed and the number of bytes read from input as well as the number of
+// bytes written to output will be returned. If not all converted bytes can fit
+// into output an E2BIG error will also be returned. If input contains an invalid
+// sequence of bytes for the Converter's fromEncoding an EILSEQ error will be returned.
+//
+// For shift based output encodings, any end shift byte sequences can be generated by
+// passing a 0 length byte slice as input. Also passing a 0 length byte slice for output
+// will simply reset the iconv descriptor shift state without writing any bytes.
 func (this *Converter) Convert(input []byte, output []byte) (bytesRead int, bytesWritten int, err Error) {
-	inputLeft := C.size_t(len(input))
-	outputLeft := C.size_t(len(output))
-	
-	if inputLeft > 0 && outputLeft > 0 {
-		// we're going to give iconv the pointers to the underlying
-		// storage of each byte slice - so far this is the simplest
-		// way i've found to do that in Go, but it seems ugly
-		inputFirstElementPointer := &input[0]
-		inputPointer := (**C.char)(unsafe.Pointer(&inputFirstElementPointer))
+	// make sure we are still open
+	if this.open {
+		inputLeft := C.size_t(len(input))
+		outputLeft := C.size_t(len(output))

-		outputFirstElementPointer := &output[0]
-		outputPointer := (**C.char)(unsafe.Pointer(&outputFirstElementPointer))
+		if inputLeft > 0 && outputLeft > 0 {
+			// we have to give iconv a pointer to a pointer of the underlying
+			// storage of each byte slice - so far this is the simplest
+			// way i've found to do that in Go, but it seems ugly
+			inputPointer := (*C.char)(unsafe.Pointer(&input[0]))
+			outputPointer := (*C.char)(unsafe.Pointer(&output[0]))

-		// we're only going to make one call to iconv
-		_,err = C.iconv(this.context, inputPointer, &inputLeft, outputPointer, &outputLeft)
+			_,err = C.iconv(this.context, &inputPointer, &inputLeft, &outputPointer, &outputLeft)

-		// update byte counters
-		bytesRead = len(input) - int(inputLeft)
-		bytesWritten = len(output) - int(outputLeft)
+			// update byte counters
+			bytesRead = len(input) - int(inputLeft)
+			bytesWritten = len(output) - int(outputLeft)
+		} else if inputLeft == 0 && outputLeft > 0 {
+			// inputPointer will be nil, outputPointer is generated as above
+			outputPointer := (*C.char)(unsafe.Pointer(&output[0]))
+
+			_,err = C.iconv(this.context, nil, &inputLeft, &outputPointer, &outputLeft)
+
+			// update write byte counter
+			bytesWritten = len(output) - int(outputLeft)
+		} else {
+			// output is zero length (whatever the input length), do a shift state reset
+			_,err = C.iconv(this.context, nil, &inputLeft, nil, &outputLeft)
+		}
+	} else {
+		err = EBADF
 	}
-	
+
 	return bytesRead, bytesWritten, err
 }

-// convert a string value, returning a new string value
+// Convert an input string
+//
+// EILSEQ error may be returned if input contains invalid bytes for the
+// Converter's fromEncoding.
 func (this *Converter) ConvertString(input string) (output string, err Error) {
+	// make sure we are still open
+	if this.open {
+		// construct the buffers
+		inputBuffer := []byte(input)
+		outputBuffer := make([]byte, len(inputBuffer) * 2) // we use a larger buffer to help avoid resizing later

-	// construct the buffers
-	inputBuffer := []byte(input)
-	outputBuffer := make([]byte, len(inputBuffer) * 2) // we use a larger buffer to help avoid resizing later
+		// call Convert until all input bytes are read or an error occurs
+		var bytesRead, totalBytesRead, bytesWritten, totalBytesWritten int

-	// call Convert until all input bytes are read or an error occurs
-	var bytesRead, totalBytesRead, bytesWritten, totalBytesWritten int
+		for totalBytesRead < len(inputBuffer) && err == nil {
+			// use the totals to create buffer slices
+			bytesRead, bytesWritten, err = this.Convert(inputBuffer[totalBytesRead:], outputBuffer[totalBytesWritten:])

-	for totalBytesRead < len(inputBuffer) && err == nil {
-		bytesRead, bytesWritten, err = this.Convert(inputBuffer, outputBuffer)
+			totalBytesRead += bytesRead
+			totalBytesWritten += bytesWritten

-		totalBytesRead += bytesRead
-		totalBytesWritten += bytesWritten
-
-		// check for the E2BIG error specifically, we can add to the output
-		// buffer to correct for it and then continue
-		if err == E2BIG {
-			// increase the size of the output buffer by another input length
-			// first, create a new buffer
-			tempBuffer := make([]byte, len(outputBuffer) + len(inputBuffer))
+			// check for the E2BIG error specifically, we can add to the output
+			// buffer to correct for it and then continue
+			if err == E2BIG {
+				// increase the size of the output buffer by another input length
+				// first, create a new buffer
+				tempBuffer := make([]byte, len(outputBuffer) + len(inputBuffer))
 			
-			// copy the existing data
-			copy(tempBuffer, outputBuffer)
+				// copy the existing data
+				copy(tempBuffer, outputBuffer)

-			// switch the buffers
-			outputBuffer = tempBuffer
+				// switch the buffers
+				outputBuffer = tempBuffer

-			// forget the error
-			err = nil
+				// forget the error
+				err = nil
+			}
 		}
+		
+		if err == nil {
+			// perform a final shift state reset
+			_, bytesWritten, err = this.Convert([]byte{}, outputBuffer[totalBytesWritten:])
+			
+			// update total count
+			totalBytesWritten += bytesWritten
+		}
+		
+		// construct the final output string
+		output = string(outputBuffer[:totalBytesWritten])
+	} else {
+		err = EBADF
 	}

-	// construct the final output string
-	output = string(outputBuffer[:totalBytesWritten])
-
 	return output, err
 }
--- a/iconv.go
+++ b/iconv.go
@ -1,30 +1,47 @@
+/*
+Wraps the iconv API present on most systems, which allows for conversion
+of bytes from one encoding to another. This package additionally provides
+some convenient interface implementations like a Reader and Writer.
+*/
 package iconv

 /*
 #include <errno.h>
 */
 import "C"
+import "os"

-import (
-	"os"
-)
-
-// allows us to check for iconv specific errors
+// Alias os.Error for convenience
 type Error os.Error

+// Error codes returned from iconv functions
 var (
-	EILSEQ Error = os.Errno(int(C.EILSEQ))
 	E2BIG Error = os.Errno(int(C.E2BIG))
+	EBADF Error = os.Errno(int(C.EBADF))
+	EINVAL Error = os.Errno(int(C.EINVAL))
+	EILSEQ Error = os.Errno(int(C.EILSEQ))
+	ENOMEM Error = os.Errno(int(C.ENOMEM))
 )

+// All in one Convert method, rather than requiring the construction of an iconv.Converter
 func Convert(input []byte, output []byte, fromEncoding string, toEncoding string) (bytesRead int, bytesWritten int, err Error) {
-	// create a new converter
+	// create a temporary converter
 	converter, err := NewConverter(fromEncoding, toEncoding)

 	if err == nil {
-		// call Convert
+		// call converter's Convert
 		bytesRead, bytesWritten, err = converter.Convert(input, output)

+		if err == nil {
+			var shiftBytesWritten int
+			
+			// call Convert with a nil input to generate any end shift sequences
+			_, shiftBytesWritten, err = converter.Convert(nil, output[bytesWritten:])
+			
+			// add shift bytes to total bytes
+			bytesWritten += shiftBytesWritten
+		}
+
 		// close the converter
 		converter.Close()
 	}
@ -32,8 +49,9 @@ func Convert(input []byte, output []byte, fromEncoding string, toEncoding string
 	return
 }

+// All in one ConvertString method, rather than requiring the construction of an iconv.Converter
 func ConvertString(input string, fromEncoding string, toEncoding string) (output string, err Error) {
-	// create a new converter
+	// create a temporary converter
 	converter, err := NewConverter(fromEncoding, toEncoding)

 	if err == nil {