Improving documentation and including shift reset logic

This commit is contained in:
Donovan Jimenez 2011-01-29 01:31:00 -05:00
parent 20aa6d93c3
commit 5ea739d3eb
4 changed files with 206 additions and 104 deletions

View File

@ -1,3 +1,4 @@
# standard GO make file preamble
include $(GOROOT)/src/Make.inc
# target package name
@ -6,10 +7,17 @@ TARG=iconv
# regular go files
GOFILES=\
reader.go\
writer.go\
# files that must be processed by cgo
CGOFILES=\
converter.go\
iconv.go\
include $(GOROOT)/src/Make.pkg
# on non glibc systems, we usually need to load the library
ifneq ($(GOOS),linux)
CGO_LDFLAGS=-liconv
endif
# standard GO make file include for packages
include $(GOROOT)/src/Make.pkg

117
README.md
View File

@ -1,72 +1,115 @@
Install
=======
# Install
The goinstall command can be used:
goinstall github.com/djimenez/iconv.go
Or, you can clone the repository and use gomake instead
The main method of installation is through gomake (provided in $GOROOT/bin)
git clone git://github.com/djimenez/iconv.go.git iconv
cd iconv
gomake install
Usage
=====
Alternatively, you can try using goinstall (also provided in $GOROOT/bin).
However, because iconv.go uses cgo to wrap iconv functions, the build may not
succeed on all systems. At time of writing goinstall was still experimental and
has known issues with cgo based packages because of how it produces its own
make file.
goinstall github.com/djimenez/iconv.go
# Usage
To use the package, you'll need the appropriate import statement:
import (
// if you used goinstall, you'll want this import
iconv "github.com/djimenez/iconv.go"
// if you used gomake install directly, you'll want this import
iconv
// if you used goinstall, you'll want this import
iconv "github.com/djimenez/iconv.go"
)
Converting string Values
------------------------
## Converting string Values
Converting a string can be done with two methods. First, there's iconv.ConvertString(input, fromEncoding, toEncoding string)
Converting a string can be done with two methods. First, there's
iconv.ConvertString(input, fromEncoding, toEncoding string)
output,_ := iconv.ConvertString("Hello World!", "utf-8", "windows-1252")
Alternatively, you can create a converter and use its ConvertString method. This mostly just saves having to parse the from and to encodings when converting many strings in the same way.
Alternatively, you can create a converter and use its ConvertString method.
Reuse of a Converter instance is recommended when doing many string conversions
between the same encodings.
converter,_ := iconv.NewConverter("utf-8", "windows-1252")
output,_ := converter.ConvertString("Hello World!")
Converting []byte Values
------------------------
Converting a []byte can similarly be done with two methods. First, there's iconv.Convert(input, output []byte, fromEncoding, toEncoding string). You'll immediately notice this requires you to give it both the input and output buffer. Ideally, the output buffer should be sized so that it can hold all converted bytes from input, but if it cannot, then Convert will put as many bytes as it can into the buffer without creating an invalid sequence. For example, if iconv only has a single byte left in the output buffer but needs 2 or more for the complete character in a multibyte encoding it will stop writing to the buffer and return with an iconv.E2BIG error.
input := []byte("Hello World!")
output := make([]byte, len(input))
bytesRead, bytesWritten, error := iconv.Convert(input, output, "utf-8", "windows-1252")
// converter can then be closed explicitly
// this will also happen when garbage collected
converter.Close()
Just like with ConvertString, there is also a Convert method on Converter that can be used.
ConvertString may return errors for the following reasons:
* EINVAL - when either the from or to encoding is not supported by iconv
* EILSEQ - when the input string contains an invalid byte sequence for the
given from encoding
## Converting []byte Values
Converting a []byte can similarly be done with two methods. First, there's
iconv.Convert(input, output []byte, fromEncoding, toEncoding string). You'll
immediately notice this requires you to give it both the input and output
buffer. Ideally, the output buffer should be sized so that it can hold all
converted bytes from input, but if it cannot, then Convert will put as many
bytes as it can into the buffer without creating an invalid sequence. For
example, if iconv only has a single byte left in the output buffer but needs 2
or more for the complete character in a multibyte encoding it will stop writing
to the buffer and return with an iconv.E2BIG error.
in := []byte("Hello World!")
out := make([]byte, len(in))
bytesRead, bytesWritten, err := iconv.Convert(in, out, "utf-8", "latin1")
Just like with ConvertString, there is also a Convert method on Converter that
can be used.
...
converter,_ := iconv.NewConverter("utf-8", "windows-1252")
bytesRead, bytesWritten, error := converter.Convert(input, output)
Convert may return errors for the following reasons:
Converting an *io.Reader
------------------------
* EINVAL - when either the from or to encoding is not supported by iconv
* EILSEQ - when the input string contains an invalid byte sequence for the
given from encoding
* E2BIG - when the output buffer is not big enough to hold the full
conversion of input
Note on E2BIG: this is a common error value, especially when converting to a
multibyte encoding, and should not be considered fatal. Partial conversion
has probably occurred, so be sure to check bytesRead and bytesWritten.
The iconv.Reader allows any other *io.Reader to be wrapped and have its bytes transcoded as they are read.
### Note on Shift Based Encodings
// We're wrapping stdin for simplicity, but a File or network reader could be wrapped as well
When using the iconv.Convert convenience function, it will automatically try to
append to your output buffer with a nil input so that any end shift sequences
are appropriately written. Using the Converter.Convert method, however, will not
automatically do this since it can be used to process a full stream in chunks.
So you'll need to remember to pass a nil input buffer at the end yourself, just
like you would with direct iconv usage.
## Converting an *io.Reader
The iconv.Reader allows any other *io.Reader to be wrapped and have its bytes
transcoded as they are read.
// We're wrapping stdin for simplicity, but a File or network reader could
// be wrapped as well
reader,_ := iconv.NewReader(os.Stdin, "utf-8", "windows-1252")
Converting an *io.Writer
------------------------
## Converting an *io.Writer
To be written.
The iconv.Writer allows any other *io.Writer to be wrapped and have its bytes
transcoded as they are written.
Piping a Conversion
-------------------
To be written.
// We're wrapping stdout for simplicity, but a File or network writer could
// be wrapped as well
writer,_ := iconv.NewWriter(os.Stdout, "utf-8", "windows-1252")

View File

@ -1,25 +1,25 @@
package iconv
/*
#include <iconv.h>
#include <stdlib.h>
#include <iconv.h>
*/
import "C"
import (
"os"
"unsafe"
)
import "os"
import "unsafe"
type Converter struct {
context C.iconv_t
open bool
}
// Initialize a new Converter. If fromEncoding or toEncoding are not supported by
// iconv then an EINVAL error will be returned. An ENOMEM error may be returned if
// there is not enough memory to initialize an iconv descriptor.
func NewConverter(fromEncoding string, toEncoding string) (converter *Converter, err Error) {
converter = new(Converter)
// create C strings
// convert to C strings
toEncodingC := C.CString(toEncoding)
fromEncodingC := C.CString(fromEncoding)
@ -39,12 +39,12 @@ func NewConverter(fromEncoding string, toEncoding string) (converter *Converter,
return
}
// Called before garbage collection
// destroy is called during garbage collection
func (this *Converter) destroy() {
this.Close()
}
// The converter can be explicitly closed if desired
// Close a Converter's iconv descriptor explicitly
func (this *Converter) Close() (err os.Error) {
if this.open {
_, err = C.iconv_close(this.context)
@ -53,73 +53,106 @@ func (this *Converter) Close() (err os.Error) {
return
}
// read bytes from an input buffer, and write them to an output buffer
// will return the number of bytesRead from the input and the number of bytes
// written to the output as well as any iconv errors
// Convert bytes from an input byte slice into a given output byte slice
//
// NOTE: not all bytes may be consumed from the input. This can be because the output
// buffer is too small or because there were iconv errors
// As many bytes as can be converted and fit into the size of output will be
// processed, and the number of bytes read from input as well as the number of
// bytes written to output will be returned. If not all converted bytes can fit
// into output, an E2BIG error will also be returned. If input contains an invalid
// sequence of bytes for the Converter's fromEncoding, an EILSEQ error will be returned.
//
// For shift based output encodings, any end shift byte sequences can be generated by
// passing a 0 length byte slice as input. Also passing a 0 length byte slice for output
// will simply reset the iconv descriptor shift state without writing any bytes.
func (this *Converter) Convert(input []byte, output []byte) (bytesRead int, bytesWritten int, err Error) {
inputLeft := C.size_t(len(input))
outputLeft := C.size_t(len(output))
if inputLeft > 0 && outputLeft > 0 {
// we're going to give iconv the pointers to the underlying
// storage of each byte slice - so far this is the simplest
// way i've found to do that in Go, but it seems ugly
inputFirstElementPointer := &input[0]
inputPointer := (**C.char)(unsafe.Pointer(&inputFirstElementPointer))
// make sure we are still open
if this.open {
inputLeft := C.size_t(len(input))
outputLeft := C.size_t(len(output))
outputFirstElementPointer := &output[0]
outputPointer := (**C.char)(unsafe.Pointer(&outputFirstElementPointer))
if inputLeft > 0 && outputLeft > 0 {
// we have to give iconv a pointer to a pointer of the underlying
// storage of each byte slice - so far this is the simplest
// way i've found to do that in Go, but it seems ugly
inputPointer := (*C.char)(unsafe.Pointer(&input[0]))
outputPointer := (*C.char)(unsafe.Pointer(&output[0]))
// we're only going to make one call to iconv
_,err = C.iconv(this.context, inputPointer, &inputLeft, outputPointer, &outputLeft)
_,err = C.iconv(this.context, &inputPointer, &inputLeft, &outputPointer, &outputLeft)
// update byte counters
bytesRead = len(input) - int(inputLeft)
bytesWritten = len(output) - int(outputLeft)
// update byte counters
bytesRead = len(input) - int(inputLeft)
bytesWritten = len(output) - int(outputLeft)
} else if inputLeft == 0 && outputLeft > 0 {
// inputPointer will be nil, outputPointer is generated as above
outputPointer := (*C.char)(unsafe.Pointer(&output[0]))
_,err = C.iconv(this.context, nil, &inputLeft, &outputPointer, &outputLeft)
// update write byte counter
bytesWritten = len(output) - int(outputLeft)
} else {
// both input and output are zero length, do a shift state reset
_,err = C.iconv(this.context, nil, &inputLeft, nil, &outputLeft)
}
} else {
err = EBADF
}
return bytesRead, bytesWritten, err
}
// convert a string value, returning a new string value
// Convert an input string
//
// EILSEQ error may be returned if input contains invalid bytes for the
// Converter's fromEncoding.
func (this *Converter) ConvertString(input string) (output string, err Error) {
// make sure we are still open
if this.open {
// construct the buffers
inputBuffer := []byte(input)
outputBuffer := make([]byte, len(inputBuffer) * 2) // we use a larger buffer to help avoid resizing later
// construct the buffers
inputBuffer := []byte(input)
outputBuffer := make([]byte, len(inputBuffer) * 2) // we use a larger buffer to help avoid resizing later
// call Convert until all input bytes are read or an error occurs
var bytesRead, totalBytesRead, bytesWritten, totalBytesWritten int
// call Convert until all input bytes are read or an error occurs
var bytesRead, totalBytesRead, bytesWritten, totalBytesWritten int
for totalBytesRead < len(inputBuffer) && err == nil {
// use the totals to create buffer slices
bytesRead, bytesWritten, err = this.Convert(inputBuffer[totalBytesRead:], outputBuffer[totalBytesWritten:])
for totalBytesRead < len(inputBuffer) && err == nil {
bytesRead, bytesWritten, err = this.Convert(inputBuffer, outputBuffer)
totalBytesRead += bytesRead
totalBytesWritten += bytesWritten
totalBytesRead += bytesRead
totalBytesWritten += bytesWritten
// check for the E2BIG error specifically, we can add to the output
// buffer to correct for it and then continue
if err == E2BIG {
// increase the size of the output buffer by another input length
// first, create a new buffer
tempBuffer := make([]byte, len(outputBuffer) + len(inputBuffer))
// check for the E2BIG error specifically, we can add to the output
// buffer to correct for it and then continue
if err == E2BIG {
// increase the size of the output buffer by another input length
// first, create a new buffer
tempBuffer := make([]byte, len(outputBuffer) + len(inputBuffer))
// copy the existing data
copy(tempBuffer, outputBuffer)
// copy the existing data
copy(tempBuffer, outputBuffer)
// switch the buffers
outputBuffer = tempBuffer
// switch the buffers
outputBuffer = tempBuffer
// forget the error
err = nil
// forget the error
err = nil
}
}
if err == nil {
// perform a final shift state reset
_, bytesWritten, err = this.Convert([]byte{}, outputBuffer[totalBytesWritten:])
// update total count
totalBytesWritten += bytesWritten
}
// construct the final output string
output = string(outputBuffer[:totalBytesWritten])
} else {
err = EBADF
}
// construct the final output string
output = string(outputBuffer[:totalBytesWritten])
return output, err
}

View File

@ -1,30 +1,47 @@
/*
Wraps the iconv API present on most systems, which allows for conversion
of bytes from one encoding to another. This package additionally provides
some convenient interface implementations like a Reader and Writer.
*/
package iconv
/*
#include <errno.h>
*/
import "C"
import "os"
import (
"os"
)
// allows us to check for iconv specific errors
// Alias os.Error for convenience
type Error os.Error
// Error codes returned from iconv functions
var (
EILSEQ Error = os.Errno(int(C.EILSEQ))
E2BIG Error = os.Errno(int(C.E2BIG))
EBADF Error = os.Errno(int(C.EBADF))
EINVAL Error = os.Errno(int(C.EINVAL))
EILSEQ Error = os.Errno(int(C.EILSEQ))
ENOMEM Error = os.Errno(int(C.ENOMEM))
)
// All in one Convert method, rather than requiring the construction of an iconv.Converter
func Convert(input []byte, output []byte, fromEncoding string, toEncoding string) (bytesRead int, bytesWritten int, err Error) {
// create a new converter
// create a temporary converter
converter, err := NewConverter(fromEncoding, toEncoding)
if err == nil {
// call Convert
// call converter's Convert
bytesRead, bytesWritten, err = converter.Convert(input, output)
if err == nil {
var shiftBytesWritten int
// call Convert with a nil input to generate any end shift sequences
_, shiftBytesWritten, err = converter.Convert(nil, output[bytesWritten:])
// add shift bytes to total bytes
bytesWritten += shiftBytesWritten
}
// close the converter
converter.Close()
}
@ -32,8 +49,9 @@ func Convert(input []byte, output []byte, fromEncoding string, toEncoding string
return
}
// All in one ConvertString method, rather than requiring the construction of an iconv.Converter
func ConvertString(input string, fromEncoding string, toEncoding string) (output string, err Error) {
// create a new converter
// create a temporary converter
converter, err := NewConverter(fromEncoding, toEncoding)
if err == nil {