Improving documentation and including shift reset logic
This commit is contained in:
parent
20aa6d93c3
commit
5ea739d3eb
8
Makefile
8
Makefile
@ -1,3 +1,4 @@
|
|||||||
|
# standard GO make file preamble
|
||||||
include $(GOROOT)/src/Make.inc
|
include $(GOROOT)/src/Make.inc
|
||||||
|
|
||||||
# target package name
|
# target package name
|
||||||
@ -6,10 +7,17 @@ TARG=iconv
|
|||||||
# regular go files
|
# regular go files
|
||||||
GOFILES=\
|
GOFILES=\
|
||||||
reader.go\
|
reader.go\
|
||||||
|
writer.go\
|
||||||
|
|
||||||
# files that must be processed by cgo
|
# files that must be processed by cgo
|
||||||
CGOFILES=\
|
CGOFILES=\
|
||||||
converter.go\
|
converter.go\
|
||||||
iconv.go\
|
iconv.go\
|
||||||
|
|
||||||
|
# on non glibc systems, we usually need to load the library
|
||||||
|
ifneq ($(GOOS),linux)
|
||||||
|
CGO_LDFLAGS=-liconv
|
||||||
|
endif
|
||||||
|
|
||||||
|
# standard GO make file include for packages
|
||||||
include $(GOROOT)/src/Make.pkg
|
include $(GOROOT)/src/Make.pkg
|
111
README.md
111
README.md
@ -1,72 +1,115 @@
|
|||||||
Install
|
# Install
|
||||||
=======
|
|
||||||
|
|
||||||
The goinstall command can be used:
|
The main method of installation is through gomake (provided in $GOROOT/bin)
|
||||||
|
|
||||||
goinstall github.com/djimenez/iconv.go
|
|
||||||
|
|
||||||
Or, you can clone the repository and use gomake instead
|
|
||||||
|
|
||||||
git clone git://github.com/djimenez/iconv.go.git iconv
|
git clone git://github.com/djimenez/iconv.go.git iconv
|
||||||
cd iconv
|
cd iconv
|
||||||
gomake install
|
gomake install
|
||||||
|
|
||||||
Usage
|
Alternatively, you can try using goinstall (also provided in $GOROOT/bin).
|
||||||
=====
|
However, because iconv.go uses cgo to wrap iconv functions, the build may not
|
||||||
|
succeed on all systems. At time of writing goinstall was still experimental and
|
||||||
|
has known issues with cgo based packages because of how it produces its own
|
||||||
|
make file.
|
||||||
|
|
||||||
|
goinstall github.com/djimenez/iconv.go
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
To use the package, you'll need the appropriate import statement:
|
To use the package, you'll need the appropriate import statement:
|
||||||
|
|
||||||
import (
|
import (
|
||||||
// if you used goinstall, you'll want this import
|
|
||||||
iconv "github.com/djimenez/iconv.go"
|
|
||||||
|
|
||||||
// if you used gomake install directly, you'll want this import
|
// if you used gomake install directly, you'll want this import
|
||||||
iconv
|
iconv
|
||||||
|
|
||||||
|
// if you used goinstall, you'll want this import
|
||||||
|
iconv "github.com/djimenez/iconv.go"
|
||||||
)
|
)
|
||||||
|
|
||||||
Converting string Values
|
## Converting string Values
|
||||||
------------------------
|
|
||||||
|
|
||||||
Converting a string can be done with two methods. First, there's iconv.ConvertString(input, fromEncoding, toEncoding string)
|
Converting a string can be done with two methods. First, there's
|
||||||
|
iconv.ConvertString(input, fromEncoding, toEncoding string)
|
||||||
|
|
||||||
output,_ := iconv.ConvertString("Hello World!", "utf-8", "windows-1252")
|
output,_ := iconv.ConvertString("Hello World!", "utf-8", "windows-1252")
|
||||||
|
|
||||||
Alternatively, you can create a converter and use its ConvertString method. This mostly just saves having to parse the from and to encodings when converting many strings in the same way.
|
Alternatively, you can create a converter and use its ConvertString method.
|
||||||
|
Reuse of a Converter instance is recommended when doing many string conversions
|
||||||
|
between the same encodings.
|
||||||
|
|
||||||
converter := iconv.NewConverter("utf-8", "windows-1252")
|
converter := iconv.NewConverter("utf-8", "windows-1252")
|
||||||
output,_ := converter.ConvertString("Hello World!")
|
output,_ := converter.ConvertString("Hello World!")
|
||||||
|
|
||||||
Converting []byte Values
|
// converter can then be closed explicitly
|
||||||
------------------------
|
// this will also happen when garbage collected
|
||||||
|
converter.Close()
|
||||||
|
|
||||||
Converting a []byte can similarly be done with two methods. First, there's iconv.Convert(input, output []byte, fromEncoding, toEncoding string). You'll immediately notice this requires you to give it both the input and output buffer. Ideally, the output buffer should be sized so that it can hold all converted bytes from input, but if it cannot, then Convert will put as many bytes as it can into the buffer without creating an invalid sequence. For example, if iconv only has a single byte left in the output buffer but needs 2 or more for the complete character in a multibyte encoding it will stop writing to the buffer and return with an iconv.E2BIG error.
|
ConvertString may return errors for the following reasons:
|
||||||
|
|
||||||
input := []byte("Hello World!")
|
* EINVAL - when either the from or to encoding is not supported by iconv
|
||||||
output := make([]byte, len(input))
|
* EILSEQ - when the input string contains an invalid byte sequence for the
|
||||||
|
given from encoding
|
||||||
|
|
||||||
bytesRead, bytesWritten, error := iconv.Convert(input, output, "utf-8", "windows-1252")
|
## Converting []byte Values
|
||||||
|
|
||||||
Just like with ConvertString, there is also a Convert method on Converter that can be used.
|
Converting a []byte can similarly be done with two methods. First, there's
|
||||||
|
iconv.Convert(input, output []byte, fromEncoding, toEncoding string). You'll
|
||||||
|
immediately notice this requires you to give it both the input and output
|
||||||
|
buffer. Ideally, the output buffer should be sized so that it can hold all
|
||||||
|
converted bytes from input, but if it cannot, then Convert will put as many
|
||||||
|
bytes as it can into the buffer without creating an invalid sequence. For
|
||||||
|
example, if iconv only has a single byte left in the output buffer but needs 2
|
||||||
|
or more for the complete character in a multibyte encoding it will stop writing
|
||||||
|
to the buffer and return with an iconv.E2BIG error.
|
||||||
|
|
||||||
|
in := []byte("Hello World!")
|
||||||
|
out := make([]byte, len(in))
|
||||||
|
|
||||||
|
bytesRead, bytesWritten, err := iconv.Convert(in, out, "utf-8", "latin1")
|
||||||
|
|
||||||
|
Just like with ConvertString, there is also a Convert method on Converter that
|
||||||
|
can be used.
|
||||||
|
|
||||||
...
|
...
|
||||||
converter := iconv.NewConverter("utf-8", "windows-1252")
|
converter := iconv.NewConverter("utf-8", "windows-1252")
|
||||||
|
|
||||||
bytesRead, bytesWritten, error := converter.Convert(input, output)
|
bytesRead, bytesWritten, error := converter.Convert(input, output)
|
||||||
|
|
||||||
Converting an *io.Reader
|
Convert may return errors for the following reasons:
|
||||||
------------------------
|
|
||||||
|
|
||||||
The iconv.Reader allows any other *io.Reader to be wrapped and have its bytes transcoded as they are read.
|
* EINVAL - when either the from or to encoding is not supported by iconv
|
||||||
|
* EILSEQ - when the input string contains an invalid byte sequence for the
|
||||||
|
given from encoding
|
||||||
|
* E2BIG - when the output buffer is not big enough to hold the full
|
||||||
|
conversion of input
|
||||||
|
|
||||||
// We're wrapping stdin for simplicity, but a File or network reader could be wrapped as well
|
Note on E2BIG: this is a common error value especially when converting to a
|
||||||
|
multibyte encoding and should not be considered fatal. Partial conversion
|
||||||
|
has probably occurred, so be sure to check bytesRead and bytesWritten.
|
||||||
|
|
||||||
|
### Note on Shift Based Encodings
|
||||||
|
|
||||||
|
When using the iconv.Convert convenience method, it will automatically try to append
|
||||||
|
to your output buffer with a nil input so that any end shift sequences are
|
||||||
|
appropriately written. Using a Converter.Convert method, however, will not
|
||||||
|
automatically do this since it can be used to process a full stream in chunks.
|
||||||
|
So you'll need to remember to pass a nil input buffer at the end yourself, just
|
||||||
|
like you would with direct iconv usage.
|
||||||
|
|
||||||
|
## Converting an *io.Reader
|
||||||
|
|
||||||
|
The iconv.Reader allows any other *io.Reader to be wrapped and have its bytes
|
||||||
|
transcoded as they are read.
|
||||||
|
|
||||||
|
// We're wrapping stdin for simplicity, but a File or network reader could
|
||||||
|
// be wrapped as well
|
||||||
reader,_ := iconv.NewReader(os.Stdin, "utf-8", "windows-1252")
|
reader,_ := iconv.NewReader(os.Stdin, "utf-8", "windows-1252")
|
||||||
|
|
||||||
Converting an *io.Writer
|
## Converting an *io.Writer
|
||||||
------------------------
|
|
||||||
|
|
||||||
To be written.
|
The iconv.Writer allows any other *io.Writer to be wrapped and have its bytes
|
||||||
|
transcoded as they are written.
|
||||||
|
|
||||||
Piping a Conversion
|
// We're wrapping stdout for simplicity, but a File or network writer could
|
||||||
-------------------
|
// be wrapped as well
|
||||||
|
writer,_ := iconv.NewWriter(os.Stdout, "utf-8", "windows-1252")
|
||||||
To be written.
|
|
||||||
|
143
converter.go
143
converter.go
@ -1,25 +1,25 @@
|
|||||||
package iconv
|
package iconv
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#include <iconv.h>
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <iconv.h>
|
||||||
*/
|
*/
|
||||||
import "C"
|
import "C"
|
||||||
|
import "os"
|
||||||
import (
|
import "unsafe"
|
||||||
"os"
|
|
||||||
"unsafe"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Converter struct {
|
type Converter struct {
|
||||||
context C.iconv_t
|
context C.iconv_t
|
||||||
open bool
|
open bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Initialize a new Converter. If fromEncoding or toEncoding are not supported by
|
||||||
|
// iconv then an EINVAL error will be returned. An ENOMEM error may be returned if
|
||||||
|
// there is not enough memory to initialize an iconv descriptor
|
||||||
func NewConverter(fromEncoding string, toEncoding string) (converter *Converter, err Error) {
|
func NewConverter(fromEncoding string, toEncoding string) (converter *Converter, err Error) {
|
||||||
converter = new(Converter)
|
converter = new(Converter)
|
||||||
|
|
||||||
// create C strings
|
// convert to C strings
|
||||||
toEncodingC := C.CString(toEncoding)
|
toEncodingC := C.CString(toEncoding)
|
||||||
fromEncodingC := C.CString(fromEncoding)
|
fromEncodingC := C.CString(fromEncoding)
|
||||||
|
|
||||||
@ -39,12 +39,12 @@ func NewConverter(fromEncoding string, toEncoding string) (converter *Converter,
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Called before garbage collection
|
// destroy is called during garbage collection
|
||||||
func (this *Converter) destroy() {
|
func (this *Converter) destroy() {
|
||||||
this.Close()
|
this.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
// The converter can be explicitly closed if desired
|
// Close a Converter's iconv descriptor explicitly
|
||||||
func (this *Converter) Close() (err os.Error) {
|
func (this *Converter) Close() (err os.Error) {
|
||||||
if this.open {
|
if this.open {
|
||||||
_, err = C.iconv_close(this.context)
|
_, err = C.iconv_close(this.context)
|
||||||
@ -53,73 +53,106 @@ func (this *Converter) Close() (err os.Error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// read bytes from an input buffer, and write them to and output buffer
|
// Convert bytes from an input byte slice into a given output byte slice
|
||||||
// will return the number of bytesRead from the input and the number of bytes
|
|
||||||
// written to the output as well as any iconv errors
|
|
||||||
//
|
//
|
||||||
// NOTE: not all bytes may be consumed from the input. This can be because the output
|
// As many bytes as can be converted and fit into the size of output will be
|
||||||
// buffer is too small or because there were iconv errors
|
// processed and the number of bytes read from input as well as the number of
|
||||||
|
// bytes written to output will be returned. If not all converted bytes can fit
|
||||||
|
// into output, an E2BIG error will also be returned. If input contains an invalid
|
||||||
|
// sequence of bytes for the Converter's fromEncoding an EILSEQ error will be returned
|
||||||
|
//
|
||||||
|
// For shift based output encodings, any end shift byte sequences can be generated by
|
||||||
|
// passing a 0 length byte slice as input. Also passing a 0 length byte slice for output
|
||||||
|
// will simply reset the iconv descriptor shift state without writing any bytes.
|
||||||
func (this *Converter) Convert(input []byte, output []byte) (bytesRead int, bytesWritten int, err Error) {
|
func (this *Converter) Convert(input []byte, output []byte) (bytesRead int, bytesWritten int, err Error) {
|
||||||
inputLeft := C.size_t(len(input))
|
// make sure we are still open
|
||||||
outputLeft := C.size_t(len(output))
|
if this.open {
|
||||||
|
inputLeft := C.size_t(len(input))
|
||||||
|
outputLeft := C.size_t(len(output))
|
||||||
|
|
||||||
if inputLeft > 0 && outputLeft > 0 {
|
if inputLeft > 0 && outputLeft > 0 {
|
||||||
// we're going to give iconv the pointers to the underlying
|
// we have to give iconv a pointer to a pointer of the underlying
|
||||||
// storage of each byte slice - so far this is the simplest
|
// storage of each byte slice - so far this is the simplest
|
||||||
// way i've found to do that in Go, but it seems ugly
|
// way i've found to do that in Go, but it seems ugly
|
||||||
inputFirstElementPointer := &input[0]
|
inputPointer := (*C.char)(unsafe.Pointer(&input[0]))
|
||||||
inputPointer := (**C.char)(unsafe.Pointer(&inputFirstElementPointer))
|
outputPointer := (*C.char)(unsafe.Pointer(&output[0]))
|
||||||
|
|
||||||
outputFirstElementPointer := &output[0]
|
_,err = C.iconv(this.context, &inputPointer, &inputLeft, &outputPointer, &outputLeft)
|
||||||
outputPointer := (**C.char)(unsafe.Pointer(&outputFirstElementPointer))
|
|
||||||
|
|
||||||
// we're only going to make one call to iconv
|
// update byte counters
|
||||||
_,err = C.iconv(this.context, inputPointer, &inputLeft, outputPointer, &outputLeft)
|
bytesRead = len(input) - int(inputLeft)
|
||||||
|
bytesWritten = len(output) - int(outputLeft)
|
||||||
|
} else if inputLeft == 0 && outputLeft > 0 {
|
||||||
|
// inputPointer will be nil, outputPointer is generated as above
|
||||||
|
outputPointer := (*C.char)(unsafe.Pointer(&output[0]))
|
||||||
|
|
||||||
// update byte counters
|
_,err = C.iconv(this.context, nil, &inputLeft, &outputPointer, &outputLeft)
|
||||||
bytesRead = len(input) - int(inputLeft)
|
|
||||||
bytesWritten = len(output) - int(outputLeft)
|
// update write byte counter
|
||||||
|
bytesWritten = len(output) - int(outputLeft)
|
||||||
|
} else {
|
||||||
|
// both input and output are zero length, do a shift state reset
|
||||||
|
_,err = C.iconv(this.context, nil, &inputLeft, nil, &outputLeft)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
err = EBADF
|
||||||
}
|
}
|
||||||
|
|
||||||
return bytesRead, bytesWritten, err
|
return bytesRead, bytesWritten, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// convert a string value, returning a new string value
|
// Convert an input string
|
||||||
|
//
|
||||||
|
// EILSEQ error may be returned if input contains invalid bytes for the
|
||||||
|
// Converter's fromEncoding.
|
||||||
func (this *Converter) ConvertString(input string) (output string, err Error) {
|
func (this *Converter) ConvertString(input string) (output string, err Error) {
|
||||||
|
// make sure we are still open
|
||||||
|
if this.open {
|
||||||
|
// construct the buffers
|
||||||
|
inputBuffer := []byte(input)
|
||||||
|
outputBuffer := make([]byte, len(inputBuffer) * 2) // we use a larger buffer to help avoid resizing later
|
||||||
|
|
||||||
// construct the buffers
|
// call Convert until all input bytes are read or an error occurs
|
||||||
inputBuffer := []byte(input)
|
var bytesRead, totalBytesRead, bytesWritten, totalBytesWritten int
|
||||||
outputBuffer := make([]byte, len(inputBuffer) * 2) // we use a larger buffer to help avoid resizing later
|
|
||||||
|
|
||||||
// call Convert until all input bytes are read or an error occurs
|
for totalBytesRead < len(inputBuffer) && err == nil {
|
||||||
var bytesRead, totalBytesRead, bytesWritten, totalBytesWritten int
|
// use the totals to create buffer slices
|
||||||
|
bytesRead, bytesWritten, err = this.Convert(inputBuffer[totalBytesRead:], outputBuffer[totalBytesWritten:])
|
||||||
|
|
||||||
for totalBytesRead < len(inputBuffer) && err == nil {
|
totalBytesRead += bytesRead
|
||||||
bytesRead, bytesWritten, err = this.Convert(inputBuffer, outputBuffer)
|
totalBytesWritten += bytesWritten
|
||||||
|
|
||||||
totalBytesRead += bytesRead
|
// check for the E2BIG error specifically, we can add to the output
|
||||||
totalBytesWritten += bytesWritten
|
// buffer to correct for it and then continue
|
||||||
|
if err == E2BIG {
|
||||||
|
// increase the size of the output buffer by another input length
|
||||||
|
// first, create a new buffer
|
||||||
|
tempBuffer := make([]byte, len(outputBuffer) + len(inputBuffer))
|
||||||
|
|
||||||
// check for the E2BIG error specifically, we can add to the output
|
// copy the existing data
|
||||||
// buffer to correct for it and then continue
|
copy(tempBuffer, outputBuffer)
|
||||||
if err == E2BIG {
|
|
||||||
// increase the size of the output buffer by another input length
|
|
||||||
// first, create a new buffer
|
|
||||||
tempBuffer := make([]byte, len(outputBuffer) + len(inputBuffer))
|
|
||||||
|
|
||||||
// copy the existing data
|
// switch the buffers
|
||||||
copy(tempBuffer, outputBuffer)
|
outputBuffer = tempBuffer
|
||||||
|
|
||||||
// switch the buffers
|
// forget the error
|
||||||
outputBuffer = tempBuffer
|
err = nil
|
||||||
|
}
|
||||||
// forget the error
|
|
||||||
err = nil
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// construct the final output string
|
if err == nil {
|
||||||
output = string(outputBuffer[:totalBytesWritten])
|
// perform a final shift state reset
|
||||||
|
_, bytesWritten, err = this.Convert([]byte{}, outputBuffer[totalBytesWritten:])
|
||||||
|
|
||||||
|
// update total count
|
||||||
|
totalBytesWritten += bytesWritten
|
||||||
|
}
|
||||||
|
|
||||||
|
// construct the final output string
|
||||||
|
output = string(outputBuffer[:totalBytesWritten])
|
||||||
|
} else {
|
||||||
|
err = EBADF
|
||||||
|
}
|
||||||
|
|
||||||
return output, err
|
return output, err
|
||||||
}
|
}
|
||||||
|
36
iconv.go
36
iconv.go
@ -1,30 +1,47 @@
|
|||||||
|
/*
|
||||||
|
Wraps the iconv API present on most systems, which allows for conversion
|
||||||
|
of bytes from one encoding to another. This package additionally provides
|
||||||
|
some convenient interface implementations like a Reader and Writer.
|
||||||
|
*/
|
||||||
package iconv
|
package iconv
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
*/
|
*/
|
||||||
import "C"
|
import "C"
|
||||||
|
import "os"
|
||||||
|
|
||||||
import (
|
// Alias os.Error for convenience
|
||||||
"os"
|
|
||||||
)
|
|
||||||
|
|
||||||
// allows us to check for iconv specific errors
|
|
||||||
type Error os.Error
|
type Error os.Error
|
||||||
|
|
||||||
|
// Error codes returned from iconv functions
|
||||||
var (
|
var (
|
||||||
EILSEQ Error = os.Errno(int(C.EILSEQ))
|
|
||||||
E2BIG Error = os.Errno(int(C.E2BIG))
|
E2BIG Error = os.Errno(int(C.E2BIG))
|
||||||
|
EBADF Error = os.Errno(int(C.EBADF))
|
||||||
|
EINVAL Error = os.Errno(int(C.EINVAL))
|
||||||
|
EILSEQ Error = os.Errno(int(C.EILSEQ))
|
||||||
|
ENOMEM Error = os.Errno(int(C.ENOMEM))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// All in one Convert method, rather than requiring the construction of an iconv.Converter
|
||||||
func Convert(input []byte, output []byte, fromEncoding string, toEncoding string) (bytesRead int, bytesWritten int, err Error) {
|
func Convert(input []byte, output []byte, fromEncoding string, toEncoding string) (bytesRead int, bytesWritten int, err Error) {
|
||||||
// create a new converter
|
// create a temporary converter
|
||||||
converter, err := NewConverter(fromEncoding, toEncoding)
|
converter, err := NewConverter(fromEncoding, toEncoding)
|
||||||
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
// call Convert
|
// call converter's Convert
|
||||||
bytesRead, bytesWritten, err = converter.Convert(input, output)
|
bytesRead, bytesWritten, err = converter.Convert(input, output)
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
var shiftBytesWritten int
|
||||||
|
|
||||||
|
// call Convert with a nil input to generate any end shift sequences
|
||||||
|
_, shiftBytesWritten, err = converter.Convert(nil, output[bytesWritten:])
|
||||||
|
|
||||||
|
// add shift bytes to total bytes
|
||||||
|
bytesWritten += shiftBytesWritten
|
||||||
|
}
|
||||||
|
|
||||||
// close the converter
|
// close the converter
|
||||||
converter.Close()
|
converter.Close()
|
||||||
}
|
}
|
||||||
@ -32,8 +49,9 @@ func Convert(input []byte, output []byte, fromEncoding string, toEncoding string
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// All in one ConvertString method, rather than requiring the construction of an iconv.Converter
|
||||||
func ConvertString(input string, fromEncoding string, toEncoding string) (output string, err Error) {
|
func ConvertString(input string, fromEncoding string, toEncoding string) (output string, err Error) {
|
||||||
// create a new converter
|
// create a temporary converter
|
||||||
converter, err := NewConverter(fromEncoding, toEncoding)
|
converter, err := NewConverter(fromEncoding, toEncoding)
|
||||||
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
Loading…
Reference in New Issue
Block a user