Merge pull request #1 from Clever/graceful-shutdown

Ability to exit work loop gracefully
This commit is contained in:
kvigen 2014-06-17 11:24:19 -07:00
commit 097037d212
2 changed files with 204 additions and 35 deletions

View File

@ -5,6 +5,7 @@ package worker
import ( import (
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"log"
"sync" "sync"
"time" "time"
) )
@ -18,11 +19,15 @@ const (
// It can connect to multi-server and grab jobs. // It can connect to multi-server and grab jobs.
type Worker struct { type Worker struct {
sync.Mutex sync.Mutex
agents []*agent agents []*agent
funcs jobFuncs funcs jobFuncs
in chan *inPack in chan *inPack
running bool running bool
ready bool ready bool
// The shuttingDown variable is protected by the Worker lock
shuttingDown bool
// Used during shutdown to wait for all active jobs to finish
activeJobs sync.WaitGroup
Id string Id string
ErrorHandler ErrorHandler ErrorHandler ErrorHandler
@ -137,7 +142,9 @@ func (worker *Worker) handleInPack(inpack *inPack) {
case dtNoJob: case dtNoJob:
inpack.a.PreSleep() inpack.a.PreSleep()
case dtNoop: case dtNoop:
inpack.a.Grab() if !worker.isShuttingDown() {
inpack.a.Grab()
}
case dtJobAssign, dtJobAssignUniq: case dtJobAssign, dtJobAssignUniq:
go func() { go func() {
if err := worker.exec(inpack); err != nil { if err := worker.exec(inpack); err != nil {
@ -147,7 +154,9 @@ func (worker *Worker) handleInPack(inpack *inPack) {
if worker.limit != nil { if worker.limit != nil {
worker.limit <- true worker.limit <- true
} }
inpack.a.Grab() if !worker.isShuttingDown() {
inpack.a.Grab()
}
case dtError: case dtError:
worker.err(inpack.Err()) worker.err(inpack.Err())
fallthrough fallthrough
@ -182,11 +191,12 @@ func (worker *Worker) Ready() (err error) {
// Main loop, block here // Main loop, block here
// Most of time, this should be evaluated in goroutine. // Most of time, this should be evaluated in goroutine.
func (worker *Worker) Work() { func (worker *Worker) Work() {
if ! worker.ready { if !worker.ready {
// didn't run Ready beforehand, so we'll have to do it: // didn't run Ready beforehand, so we'll have to do it:
err := worker.Ready() err := worker.Ready()
if err != nil { if err != nil {
panic( err ) log.Println("Error making worker ready: " + err.Error())
panic(err)
} }
} }
@ -224,6 +234,16 @@ func (worker *Worker) Close() {
} }
} }
// Shutdown server gracefully. This function will block until all active work has finished.
func (worker *Worker) Shutdown() {
worker.Lock()
worker.shuttingDown = true
worker.Unlock()
// Wait for all the active jobs to finish
worker.activeJobs.Wait()
worker.Close()
}
// Echo // Echo
func (worker *Worker) Echo(data []byte) { func (worker *Worker) Echo(data []byte) {
outpack := getOutPack() outpack := getOutPack()
@ -250,6 +270,13 @@ func (worker *Worker) SetId(id string) {
worker.broadcast(outpack) worker.broadcast(outpack)
} }
// IsShutdown checks to see if the worker is in the process of being shutdown.
func (worker *Worker) isShuttingDown() bool {
worker.Lock()
defer worker.Unlock()
return worker.shuttingDown
}
// inner job executing // inner job executing
func (worker *Worker) exec(inpack *inPack) (err error) { func (worker *Worker) exec(inpack *inPack) (err error) {
defer func() { defer func() {
@ -263,7 +290,14 @@ func (worker *Worker) exec(inpack *inPack) (err error) {
err = ErrUnknown err = ErrUnknown
} }
} }
worker.activeJobs.Done()
}() }()
worker.activeJobs.Add(1)
// Make sure that we don't accept any new work from old grab requests
// after we starting shutting down.
if worker.isShuttingDown() {
return
}
f, ok := worker.funcs[inpack.fn] f, ok := worker.funcs[inpack.fn]
if !ok { if !ok {
return fmt.Errorf("The function does not exist: %s", inpack.fn) return fmt.Errorf("The function does not exist: %s", inpack.fn)

View File

@ -1,7 +1,9 @@
package worker package worker
import ( import (
"errors"
"sync" "sync"
"sync/atomic"
"testing" "testing"
"time" "time"
) )
@ -78,12 +80,11 @@ func TestWork(t *testing.T) {
wg.Wait() wg.Wait()
} }
func TestWorkerClose(t *testing.T) { func TestWorkerClose(t *testing.T) {
worker.Close() worker.Close()
} }
func TestWorkWithoutReady(t * testing.T){ func TestWorkWithoutReady(t *testing.T) {
other_worker := New(Unlimited) other_worker := New(Unlimited)
if err := other_worker.AddServer(Network, "127.0.0.1:4730"); err != nil { if err := other_worker.AddServer(Network, "127.0.0.1:4730"); err != nil {
@ -94,13 +95,13 @@ func TestWorkWithoutReady(t * testing.T){
} }
timeout := make(chan bool, 1) timeout := make(chan bool, 1)
done := make( chan bool, 1) done := make(chan bool, 1)
other_worker.JobHandler = func( j Job ) error { other_worker.JobHandler = func(j Job) error {
if( ! other_worker.ready ){ if !other_worker.ready {
t.Error("Worker not ready as expected"); t.Error("Worker not ready as expected")
} }
done <-true done <- true
return nil return nil
} }
go func() { go func() {
@ -108,15 +109,15 @@ func TestWorkWithoutReady(t * testing.T){
timeout <- true timeout <- true
}() }()
go func(){ go func() {
other_worker.Work(); other_worker.Work()
}() }()
// With the all-in-one Work() we don't know if the // With the all-in-one Work() we don't know if the
// worker is ready at this stage so we may have to wait a sec: // worker is ready at this stage so we may have to wait a sec:
go func(){ go func() {
tries := 3 tries := 3
for( tries > 0 ){ for tries > 0 {
if other_worker.ready { if other_worker.ready {
other_worker.Echo([]byte("Hello")) other_worker.Echo([]byte("Hello"))
break break
@ -129,22 +130,22 @@ func TestWorkWithoutReady(t * testing.T){
}() }()
// determine if we've finished or timed out: // determine if we've finished or timed out:
select{ select {
case <- timeout: case <-timeout:
t.Error("Test timed out waiting for the worker") t.Error("Test timed out waiting for the worker")
case <- done: case <-done:
} }
} }
func TestWorkWithoutReadyWithPanic(t * testing.T){ func TestWorkWithoutReadyWithPanic(t *testing.T) {
other_worker := New(Unlimited) other_worker := New(Unlimited)
timeout := make(chan bool, 1) timeout := make(chan bool, 1)
done := make( chan bool, 1) done := make(chan bool, 1)
// Going to work with no worker setup. // Going to work with no worker setup.
// when Work (hopefully) calls Ready it will get an error which should cause it to panic() // when Work (hopefully) calls Ready it will get an error which should cause it to panic()
go func(){ go func() {
defer func() { defer func() {
if err := recover(); err != nil { if err := recover(); err != nil {
done <- true done <- true
@ -153,17 +154,151 @@ func TestWorkWithoutReadyWithPanic(t * testing.T){
t.Error("Work should raise a panic.") t.Error("Work should raise a panic.")
done <- true done <- true
}() }()
other_worker.Work(); other_worker.Work()
}() }()
go func() { go func() {
time.Sleep(2 * time.Second) time.Sleep(2 * time.Second)
timeout <- true timeout <- true
}() }()
select{ select {
case <- timeout: case <-timeout:
t.Error("Test timed out waiting for the worker") t.Error("Test timed out waiting for the worker")
case <- done: case <-done:
} }
} }
// initWorker creates a worker and adds the localhost server to it
func initWorker(t *testing.T) *Worker {
otherWorker := New(Unlimited)
if err := otherWorker.AddServer(Network, "127.0.0.1:4730"); err != nil {
t.Error(err)
}
return otherWorker
}
// submitEmptyInPack sends an empty inpack with the specified fn name to the worker. It uses
// the first agent of the worker.
func submitEmptyInPack(t *testing.T, worker *Worker, function string) {
if l := len(worker.agents); l != 1 {
t.Error("The worker has no agents")
}
inpack := getInPack()
inpack.dataType = dtJobAssign
inpack.fn = function
inpack.a = worker.agents[0]
worker.in <- inpack
}
// TestShutdownSuccessJob tests that shutdown handles active jobs that will succeed
func TestShutdownSuccessJob(t *testing.T) {
otherWorker := initWorker(t)
finishedJob := false
var wg sync.WaitGroup
successJob := func(job Job) ([]byte, error) {
wg.Done()
// Sleep for 10ms to ensure that the shutdown waits for this to finish
time.Sleep(time.Duration(10 * time.Millisecond))
finishedJob = true
return nil, nil
}
if err := otherWorker.AddFunc("test", successJob, 0); err != nil {
t.Error(err)
}
if err := otherWorker.Ready(); err != nil {
t.Error(err)
return
}
submitEmptyInPack(t, otherWorker, "test")
go otherWorker.Work()
// Wait for the success_job to start so that we know we didn't shutdown before even
// beginning to process the job.
wg.Add(1)
wg.Wait()
otherWorker.Shutdown()
if !finishedJob {
t.Error("Didn't finish job")
}
}
// TestShutdownFailureJob tests that shutdown handles active jobs that will fail
func TestShutdownFailureJob(t *testing.T) {
otherWorker := initWorker(t)
var wg sync.WaitGroup
finishedJob := false
failureJob := func(job Job) ([]byte, error) {
wg.Done()
// Sleep for 10ms to ensure that shutdown waits for this to finish
time.Sleep(time.Duration(10 * time.Millisecond))
finishedJob = true
return nil, errors.New("Error!")
}
if err := otherWorker.AddFunc("test", failureJob, 0); err != nil {
t.Error(err)
}
if err := otherWorker.Ready(); err != nil {
t.Error(err)
return
}
submitEmptyInPack(t, otherWorker, "test")
go otherWorker.Work()
// Wait for the failure_job to start so that we know we didn't shutdown before even
// beginning to process the job.
wg.Add(1)
wg.Wait()
otherWorker.Shutdown()
if !finishedJob {
t.Error("Didn't finish the failed job")
}
}
func TestSubmitMultipleJobs(t *testing.T) {
otherWorker := initWorker(t)
var startJobs sync.WaitGroup
startJobs.Add(2)
var jobsFinished int32 = 0
job := func(job Job) ([]byte, error) {
startJobs.Done()
// Sleep for 10ms to ensure that the shutdown waits for this to finish
time.Sleep(time.Duration(10 * time.Millisecond))
atomic.AddInt32(&jobsFinished, 1)
return nil, nil
}
if err := otherWorker.AddFunc("test", job, 0); err != nil {
t.Error(err)
}
if err := otherWorker.Ready(); err != nil {
t.Error(err)
return
}
submitEmptyInPack(t, otherWorker, "test")
submitEmptyInPack(t, otherWorker, "test")
go otherWorker.Work()
startJobs.Wait()
otherWorker.Shutdown()
if jobsFinished != 2 {
t.Error("Didn't run both jobs")
}
}
func TestSubmitJobAfterShutdown(t *testing.T) {
otherWorker := initWorker(t)
noRunJob := func(job Job) ([]byte, error) {
t.Error("This job shouldn't have been run")
return nil, nil
}
if err := otherWorker.AddFunc("test", noRunJob, 0); err != nil {
t.Error(err)
}
if err := otherWorker.Ready(); err != nil {
t.Error(err)
return
}
go otherWorker.Work()
otherWorker.Shutdown()
submitEmptyInPack(t, otherWorker, "test")
// Sleep for 10ms to make sure that the job doesn't run
time.Sleep(time.Duration(10 * time.Millisecond))
}