@@ -30,11 +30,13 @@ const (
30
30
)
31
31
32
32
type Process struct {
33
- ID string
34
- config ModelConfig
35
- cmd * exec.Cmd
36
- logMonitor * LogMonitor
37
- healthCheckTimeout int
33
+ ID string
34
+ config ModelConfig
35
+ cmd * exec.Cmd
36
+ logMonitor * LogMonitor
37
+
38
+ healthCheckTimeout int
39
+ healthCheckLoopInterval time.Duration
38
40
39
41
lastRequestHandled time.Time
40
42
@@ -54,51 +56,57 @@ type Process struct {
54
56
func NewProcess (ID string , healthCheckTimeout int , config ModelConfig , logMonitor * LogMonitor ) * Process {
55
57
ctx , cancel := context .WithCancel (context .Background ())
56
58
return & Process {
57
- ID : ID ,
58
- config : config ,
59
- cmd : nil ,
60
- logMonitor : logMonitor ,
61
- healthCheckTimeout : healthCheckTimeout ,
62
- state : StateStopped ,
63
- shutdownCtx : ctx ,
64
- shutdownCancel : cancel ,
59
+ ID : ID ,
60
+ config : config ,
61
+ cmd : nil ,
62
+ logMonitor : logMonitor ,
63
+ healthCheckTimeout : healthCheckTimeout ,
64
+ healthCheckLoopInterval : 5 * time .Second , /* default, can not be set by user - used for testing */
65
+ state : StateStopped ,
66
+ shutdownCtx : ctx ,
67
+ shutdownCancel : cancel ,
65
68
}
66
69
}
67
70
68
- func (p * Process ) setState (newState ProcessState ) error {
69
- // enforce valid state transitions
70
- invalidTransition := false
71
- if p .state == StateStopped {
72
- // stopped -> starting
73
- if newState != StateStarting {
74
- invalidTransition = true
75
- }
76
- } else if p .state == StateStarting {
77
- // starting -> ready | failed | stopping
78
- if newState != StateReady && newState != StateFailed && newState != StateStopping {
79
- invalidTransition = true
80
- }
81
- } else if p .state == StateReady {
82
- // ready -> stopping
83
- if newState != StateStopping {
84
- invalidTransition = true
85
- }
86
- } else if p .state == StateStopping {
87
- // stopping -> stopped | shutdown
88
- if newState != StateStopped && newState != StateShutdown {
89
- invalidTransition = true
90
- }
91
- } else if p .state == StateFailed || p .state == StateShutdown {
92
- invalidTransition = true
71
// Sentinel errors returned by swapState.

// ErrExpectedStateMismatch is returned when the process is not in the state
// the caller expected it to be in.
var ErrExpectedStateMismatch = errors.New("expected state mismatch")

// ErrInvalidStateTransition is returned when moving from the current state
// to the requested state is not permitted.
var ErrInvalidStateTransition = errors.New("invalid state transition")
76
+
77
+ // swapState performs a compare and swap of the state atomically. It returns the current state
78
+ // and an error if the swap failed.
79
+ func (p * Process ) swapState (expectedState , newState ProcessState ) (ProcessState , error ) {
80
+ p .stateMutex .Lock ()
81
+ defer p .stateMutex .Unlock ()
82
+
83
+ if p .state != expectedState {
84
+ return p .state , ErrExpectedStateMismatch
93
85
}
94
86
95
- if invalidTransition {
96
- //panic(fmt.Sprintf("Invalid state transition from %s to %s", p.state, newState))
97
- return fmt .Errorf ("invalid state transition from %s to %s" , p .state , newState )
87
+ if ! isValidTransition (p .state , newState ) {
88
+ return p .state , ErrInvalidStateTransition
98
89
}
99
90
100
91
p .state = newState
101
- return nil
92
+ return p .state , nil
93
+ }
94
+
95
+ // Helper function to encapsulate transition rules
96
+ func isValidTransition (from , to ProcessState ) bool {
97
+ switch from {
98
+ case StateStopped :
99
+ return to == StateStarting
100
+ case StateStarting :
101
+ return to == StateReady || to == StateFailed || to == StateStopping
102
+ case StateReady :
103
+ return to == StateStopping
104
+ case StateStopping :
105
+ return to == StateStopped || to == StateShutdown
106
+ case StateFailed , StateShutdown :
107
+ return false // No transitions allowed from these states
108
+ }
109
+ return false
102
110
}
103
111
104
112
func (p * Process ) CurrentState () ProcessState {
@@ -116,65 +124,48 @@ func (p *Process) start() error {
116
124
return fmt .Errorf ("can not start(), upstream proxy missing" )
117
125
}
118
126
119
- // multiple start() calls will wait for the one that is actually starting to
120
- // complete before proceeding.
121
- // ===========
122
- curState := p .CurrentState ()
123
-
124
- if curState == StateReady {
125
- return nil
126
- }
127
-
128
- if curState == StateStarting {
129
- p .waitStarting .Wait ()
130
-
131
- if state := p .CurrentState (); state != StateReady {
132
- return fmt .Errorf ("start() failed current state: %v" , state )
133
- }
134
-
135
- return nil
127
+ args , err := p .config .SanitizedCommand ()
128
+ if err != nil {
129
+ return fmt .Errorf ("unable to get sanitized command: %v" , err )
136
130
}
137
- // ===========
138
-
139
- // There is the possibility of a hard to replicate race condition where
140
- // curState *WAS* StateStopped but by the time we get to the p.stateMutex.Lock()
141
- // below, it's value has changed!
142
-
143
- p .stateMutex .Lock ()
144
- defer p .stateMutex .Unlock ()
145
131
146
- // with the exclusive lock, check if p.state is StateStopped, which is the only valid state
147
- // to transition from to StateReady
148
-
149
- if p .state != StateStopped {
150
- if p .state == StateReady {
151
- return nil
132
+ if curState , err := p .swapState (StateStopped , StateStarting ); err != nil {
133
+ if err == ErrExpectedStateMismatch {
134
+ // already starting, just wait for it to complete and expect
135
+ // it to be in the Ready state after. If not, return an error
136
+ if curState == StateStarting {
137
+ p .waitStarting .Wait ()
138
+ if state := p .CurrentState (); state == StateReady {
139
+ return nil
140
+ } else {
141
+ return fmt .Errorf ("process was already starting but wound up in state %v" , state )
142
+ }
143
+ } else {
144
+ return fmt .Errorf ("processes was in state %v when start() was called" , curState )
145
+ }
152
146
} else {
153
- return fmt .Errorf ("start() can not proceed expected StateReady but process is in %v" , p . state )
147
+ return fmt .Errorf ("failed to set Process state to starting: current state: %v, error: %v" , curState , err )
154
148
}
155
149
}
156
150
157
- if err := p .setState (StateStarting ); err != nil {
158
- return err
159
- }
160
-
161
151
p .waitStarting .Add (1 )
162
152
defer p .waitStarting .Done ()
163
153
164
- args , err := p .config .SanitizedCommand ()
165
- if err != nil {
166
- return fmt .Errorf ("unable to get sanitized command: %v" , err )
167
- }
168
-
169
154
p .cmd = exec .Command (args [0 ], args [1 :]... )
170
155
p .cmd .Stdout = p .logMonitor
171
156
p .cmd .Stderr = p .logMonitor
172
157
p .cmd .Env = p .config .Env
173
158
174
159
err = p .cmd .Start ()
175
160
161
+ // Set process state to failed
176
162
if err != nil {
177
- p .setState (StateFailed )
163
+ if curState , swapErr := p .swapState (StateStarting , StateFailed ); err != nil {
164
+ return fmt .Errorf (
165
+ "failed to start command and state swap failed. command error: %v, current state: %v, state swap error: %v" ,
166
+ err , curState , swapErr ,
167
+ )
168
+ }
178
169
return fmt .Errorf ("start() failed: %v" , err )
179
170
}
180
171
@@ -209,13 +200,16 @@ func (p *Process) start() error {
209
200
)
210
201
defer cancelHealthCheck ()
211
202
212
- // Health check loop
213
203
loop:
204
+ // Ready Check loop
214
205
for {
215
206
select {
216
207
case <- checkDeadline .Done ():
217
- p .setState (StateFailed )
218
- return fmt .Errorf ("health check failed after %vs" , maxDuration .Seconds ())
208
+ if curState , err := p .swapState (StateStarting , StateFailed ); err != nil {
209
+ return fmt .Errorf ("health check timed out after %vs AND state swap failed: %v, current state: %v" , maxDuration .Seconds (), err , curState )
210
+ } else {
211
+ return fmt .Errorf ("health check timed out after %vs" , maxDuration .Seconds ())
212
+ }
219
213
case <- p .shutdownCtx .Done ():
220
214
return errors .New ("health check interrupted due to shutdown" )
221
215
default :
@@ -233,7 +227,7 @@ func (p *Process) start() error {
233
227
}
234
228
}
235
229
236
- <- time .After (5 * time . Second )
230
+ <- time .After (p . healthCheckLoopInterval )
237
231
}
238
232
}
239
233
@@ -244,7 +238,7 @@ func (p *Process) start() error {
244
238
maxDuration := time .Duration (p .config .UnloadAfter ) * time .Second
245
239
246
240
for range time .Tick (time .Second ) {
247
- if p .state != StateReady {
241
+ if p .CurrentState () != StateReady {
248
242
return
249
243
}
250
244
@@ -260,46 +254,38 @@ func (p *Process) start() error {
260
254
}()
261
255
}
262
256
263
- return p .setState (StateReady )
257
+ if curState , err := p .swapState (StateStarting , StateReady ); err != nil {
258
+ return fmt .Errorf ("failed to set Process state to ready: current state: %v, error: %v" , curState , err )
259
+ } else {
260
+ return nil
261
+ }
264
262
}
265
263
266
264
func (p * Process ) Stop () {
267
265
// wait for any inflight requests before proceeding
268
266
p .inFlightRequests .Wait ()
269
- p .stateMutex .Lock ()
270
- defer p .stateMutex .Unlock ()
271
267
272
268
// calling Stop() when state is invalid is a no-op
273
- if err := p .setState ( StateStopping ); err != nil {
274
- fmt .Fprintf (p .logMonitor , "!!! Info - Stop() err: %v\n " , err )
269
+ if curState , err := p .swapState ( StateReady , StateStopping ); err != nil {
270
+ fmt .Fprintf (p .logMonitor , "!!! Info - Stop() Ready -> StateStopping err: %v, current state: %v \n " , err , curState )
275
271
return
276
272
}
277
273
278
274
// stop the process with a graceful exit timeout
279
275
p .stopCommand (5 * time .Second )
280
276
281
- if err := p .setState ( StateStopped ); err != nil {
282
- panic ( fmt .Sprintf ( " Stop() failed to set state to stopped : %v" , err ) )
277
+ if curState , err := p .swapState ( StateStopping , StateStopped ); err != nil {
278
+ fmt .Fprintf ( p . logMonitor , "!!! Info - Stop() StateStopping -> StateStopped err: %v, current state : %v\n " , err , curState )
283
279
}
284
280
}
285
281
286
282
// Shutdown is called when llama-swap is shutting down. It will give a little bit
287
283
// of time for any inflight requests to complete before shutting down. If the Process
288
284
// is in the state of starting, it will cancel it and shut it down
289
285
func (p * Process ) Shutdown () {
290
- // cancel anything that can be interrupted by a shutdown (ie: healthcheck)
291
286
p .shutdownCancel ()
292
-
293
- p .stateMutex .Lock ()
294
- defer p .stateMutex .Unlock ()
295
- p .setState (StateStopping )
296
-
297
- // 5 seconds to stop the process
298
287
p .stopCommand (5 * time .Second )
299
- if err := p .setState (StateShutdown ); err != nil {
300
- fmt .Printf ("!!! Shutdown() failed to set state to shutdown: %v" , err )
301
- }
302
- p .setState (StateShutdown )
288
+ p .state = StateShutdown
303
289
}
304
290
305
291
// stopCommand will send a SIGTERM to the process and wait for it to exit.
0 commit comments