From 1ebb3d5fcc8521363a3f18f3e95f759034dd2e94 Mon Sep 17 00:00:00 2001 From: Joe Higton Date: Tue, 10 Jun 2014 03:23:18 +0100 Subject: [PATCH 1/3] Wrap disconnect errors and allow reconnect --- worker/agent.go | 34 ++++++- worker/worker.go | 32 ++++++- worker/worker_disconnect_test.go | 147 +++++++++++++++++++++++-------- 3 files changed, 176 insertions(+), 37 deletions(-) diff --git a/worker/agent.go b/worker/agent.go index 194545b..310cca8 100644 --- a/worker/agent.go +++ b/worker/agent.go @@ -46,6 +46,7 @@ func (a *agent) work() { a.worker.err(err.(error)) } }() + var inpack *inPack var l int var err error @@ -56,7 +57,17 @@ func (a *agent) work() { if opErr.Temporary() { continue }else{ - a.worker.err(err) + a.Lock() + if( a.conn != nil ){ + a.Unlock() + err = &WorkerDisconnectError{ + OpError : opErr, + agent : a, + } + a.worker.err(err) + } + // else - we're probably dc'ing due to a Close() + break } @@ -107,6 +118,10 @@ func (a *agent) Close() { func (a *agent) Grab() { a.Lock() defer a.Unlock() + a.grab() +} + +func (a *agent) grab(){ outpack := getOutPack() outpack.dataType = dtGrabJobUniq a.write(outpack) @@ -120,6 +135,23 @@ func (a *agent) PreSleep() { a.write(outpack) } +func (a *agent) reconnect() (error){ + a.Lock() + defer a.Unlock() + conn, err := net.Dial(a.net, a.addr) + if err != nil { + return err; + } + a.conn = conn + a.rw = bufio.NewReadWriter(bufio.NewReader(a.conn), + bufio.NewWriter(a.conn)) + a.grab() + a.worker.reRegisterFuncsForAgent(a) + + go a.work() + return nil +} + // read length bytes from the socket func (a *agent) read(length int) (data []byte, err error) { n := 0 diff --git a/worker/worker.go b/worker/worker.go index a81f4ab..3b9f695 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -7,6 +7,7 @@ import ( "fmt" "sync" "time" + "net" ) const ( @@ -30,6 +31,7 @@ type Worker struct { limit chan bool } + // Return a worker. // // If limit is set to Unlimited(=0), the worker will grab all jobs @@ -94,6 +96,11 @@ func (worker *Worker) AddFunc(funcname string, // inner add func (worker *Worker) addFunc(funcname string, timeout uint32) { + outpack := prepFuncOutpack( funcname, timeout) + worker.broadcast(outpack) +} + +func prepFuncOutpack(funcname string, timeout uint32) (*outPack){ outpack := getOutPack() if timeout == 0 { outpack.dataType = dtCanDo @@ -106,7 +113,7 @@ func (worker *Worker) addFunc(funcname string, timeout uint32) { outpack.data[l] = '\x00' binary.BigEndian.PutUint32(outpack.data[l+1:], timeout) } - worker.broadcast(outpack) + return outpack } // Remove a function. @@ -293,6 +300,15 @@ func (worker *Worker) exec(inpack *inPack) (err error) { } return } +func (worker *Worker)reRegisterFuncsForAgent( a * agent ){ + worker.Lock() + defer worker.Unlock() + for funcname, f := range worker.funcs { + outpack := prepFuncOutpack( funcname, f.timeout) + a.write(outpack) + } + +} // inner result type result struct { @@ -316,3 +332,17 @@ func execTimeout(f JobFunc, job Job, timeout time.Duration) (r *result) { } return r } + +// Error type passed when a worker connection disconnects +type WorkerDisconnectError struct{ + *net.OpError + agent * agent +} +// Responds to the error by asking the worker to reconnect +func (e *WorkerDisconnectError) Reconnect() ( err error ){ + return e.agent.reconnect() +} +// Which server was this for? +func(e *WorkerDisconnectError) Server() ( net string, addr string ){ + return e.agent.net, e.agent.addr +} diff --git a/worker/worker_disconnect_test.go b/worker/worker_disconnect_test.go index e87e228..8925e7a 100644 --- a/worker/worker_disconnect_test.go +++ b/worker/worker_disconnect_test.go @@ -20,8 +20,8 @@ func init() { if check_gearman_present() { panic(`Something already listening on our testing port. Chickening out of testing with it!`) } - gearman_ready = make( chan bool ) - kill_gearman = make( chan bool ) + gearman_ready = make(chan bool) + kill_gearman = make(chan bool) // TODO: verify port is clear go run_gearman() } @@ -61,7 +61,6 @@ func check_gearman_present() bool { log.Println("gearman not ready " + err.Error()) return false } - log.Println("gearman ready") con.Close() return true } @@ -81,38 +80,32 @@ func check_gearman_is_dead() bool { Checks for a disconnect whilst not working */ func TestBasicDisconnect(t *testing.T) { - <- gearman_ready - + <-gearman_ready + worker := New(Unlimited) timeout := make(chan bool, 1) - done := make( chan bool, 1) + done := make(chan bool, 1) - if err := worker.AddServer(Network, "127.0.0.1:" + port); err != nil { + if err := worker.AddServer(Network, "127.0.0.1:"+port); err != nil { t.Error(err) } - work_done := false; - if err := worker.AddFunc("gearman-go-workertest", - func(j Job)(b []byte, e error){ - work_done = true; - done <- true - return}, 0); - err != nil { + work_done := false + if err := worker.AddFunc("gearman-go-workertest", + func(j Job) (b []byte, e error) { + work_done = true + done <- true + return + }, 0); err != nil { t.Error(err) } - - worker.JobHandler = func( j Job ) error { - if( ! worker.ready ){ - t.Error("Worker not ready as expected"); - } - done <-true - return nil - } - handled_errors := false - - c_error := make( chan bool) - worker.ErrorHandler = func( e error ){ - log.Println( e ) + handled_errors := false + + c_error := make(chan bool) + was_dc_err := false + worker.ErrorHandler = func(e error) { + log.Println(e) + _, was_dc_err = e.(*WorkerDisconnectError) handled_errors = true c_error <- true } @@ -149,22 +142,106 @@ func TestBasicDisconnect(t *testing.T) { t.Error("Test timed out waiting for the error handler") case <-c_error: // error was handled! + if ! was_dc_err { + t.Error( "Disconnect didn't manifest as a net.OpError?") + } + } + worker.Close() + kill_gearman <- true + +} + +func TestDcRc(t *testing.T) { + check_gearman_is_dead() + go run_gearman() + + <-gearman_ready + + worker := New(Unlimited) + timeout := make(chan bool, 1) + done := make(chan bool, 1) + + if err := worker.AddServer(Network, "127.0.0.1:"+port); err != nil { + t.Error(err) + } + work_done := false + if err := worker.AddFunc("gearman-go-workertest", + func(j Job) (b []byte, e error) { + log.Println("Actual work happens!") + work_done = true + done <- true + return + }, 0); err != nil { + t.Error(err) } + worker.ErrorHandler = func(e error) { + wdc, wdcok := e.(*WorkerDisconnectError) + + if( wdcok){ + log.Println("Reconnecting!") + reconnected := false + for tries := 20 ; ! reconnected && tries > 0; tries -- { + rcerr := wdc.Reconnect() + if rcerr != nil{ + time.Sleep(250 * time.Millisecond) + } else{ + reconnected = true; + } + } + + + } else{ + panic("Some other kind of error " + e.Error()) + } + + } + + go func() { + time.Sleep(5 * time.Second) + timeout <- true + }() + + err := worker.Ready() + + if err != nil { + t.Error(err) + } + + go worker.Work() + + kill_gearman <- true + + check_gearman_is_dead() + go run_gearman() + + select { + case <-gearman_ready: + case <-timeout: + } + + send_client_request() + + select { + case <- done: + case <-timeout: + t.Error("Test timed out") + } + worker.Close() kill_gearman <- true } -func send_client_request(){ - log.Println("sending client request"); - c, err := client.New( Network, "127.0.0.1:" + port ) +func send_client_request() { + log.Println("sending client request") + c, err := client.New(Network, "127.0.0.1:"+port) if err == nil { _, err = c.DoBg("gearman-go-workertest", []byte{}, client.JobHigh) if err != nil { - log.Println( "error sending client request " + err.Error() ) + log.Println("error sending client request " + err.Error()) } - - }else{ - log.Println( "error with client " + err.Error() ) + + } else { + log.Println("error with client " + err.Error()) } -} \ No newline at end of file +} From 09c626f48830df67b05f5a54ca5e41428a9286dc Mon Sep 17 00:00:00 2001 From: Joe Higton Date: Tue, 10 Jun 2014 03:46:21 +0100 Subject: [PATCH 2/3] Cope with io.EOF as a disconnect error --- worker/agent.go | 23 ++++++++++++++--------- worker/worker.go | 7 +++++-- worker/worker_disconnect_test.go | 3 --- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/worker/agent.go b/worker/agent.go index 310cca8..9befa9c 100644 --- a/worker/agent.go +++ b/worker/agent.go @@ -4,6 +4,7 @@ import ( "bufio" "net" "sync" + "io" ) // The agent of job server. @@ -57,20 +58,14 @@ func (a *agent) work() { if opErr.Temporary() { continue }else{ - a.Lock() - if( a.conn != nil ){ - a.Unlock() - err = &WorkerDisconnectError{ - OpError : opErr, - agent : a, - } - a.worker.err(err) - } + a.disconnect_error(err) // else - we're probably dc'ing due to a Close() break } + } else if( err == io.EOF ){ + a.disconnect_error(err) } a.worker.err(err) // If it is unexpected error and the connection wasn't @@ -106,6 +101,16 @@ func (a *agent) work() { } } +func (a * agent) disconnect_error( err error ){ + if( a.conn != nil ){ + err = &WorkerDisconnectError{ + err : err, + agent : a, + } + a.worker.err(err) + } +} + func (a *agent) Close() { a.Lock() defer a.Unlock() diff --git a/worker/worker.go b/worker/worker.go index 3b9f695..0bb7b4c 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -7,7 +7,6 @@ import ( "fmt" "sync" "time" - "net" ) const ( @@ -335,9 +334,13 @@ func execTimeout(f JobFunc, job Job, timeout time.Duration) (r *result) { // Error type passed when a worker connection disconnects type WorkerDisconnectError struct{ - *net.OpError + err error agent * agent } +func (e *WorkerDisconnectError) Error() ( string){ + return e.err.Error(); +} + // Responds to the error by asking the worker to reconnect func (e *WorkerDisconnectError) Reconnect() ( err error ){ return e.agent.reconnect() diff --git a/worker/worker_disconnect_test.go b/worker/worker_disconnect_test.go index 8925e7a..34a3621 100644 --- a/worker/worker_disconnect_test.go +++ b/worker/worker_disconnect_test.go @@ -36,7 +36,6 @@ func run_gearman() { // Make sure we clear up our gearman: defer func() { - log.Println("killing gearmand") gm_cmd.Process.Kill() }() @@ -58,7 +57,6 @@ func run_gearman() { func check_gearman_present() bool { con, err := net.Dial(`tcp`, `127.0.0.1:`+port) if err != nil { - log.Println("gearman not ready " + err.Error()) return false } con.Close() @@ -233,7 +231,6 @@ func TestDcRc(t *testing.T) { } func send_client_request() { - log.Println("sending client request") c, err := client.New(Network, "127.0.0.1:"+port) if err == nil { _, err = c.DoBg("gearman-go-workertest", []byte{}, client.JobHigh) From 97731e177477b18a0931850228ccb22e005c61d3 Mon Sep 17 00:00:00 2001 From: Joe Higton Date: Tue, 10 Jun 2014 04:09:27 +0100 Subject: [PATCH 3/3] FIX: EOF disconnect error also called raw handler afterwards --- worker/agent.go | 1 + 1 file changed, 1 insertion(+) diff --git a/worker/agent.go b/worker/agent.go index 9befa9c..7385211 100644 --- a/worker/agent.go +++ b/worker/agent.go @@ -66,6 +66,7 @@ func (a *agent) work() { } else if( err == io.EOF ){ a.disconnect_error(err) + break } a.worker.err(err) // If it is unexpected error and the connection wasn't