Skip to content

Commit 6533f88

Browse files
BinBin Hesingholt
BinBin He
authored andcommitted
Terminally exit on unrecoverable exceptions for RCI.
1 parent 5073adb commit 6533f88

File tree

3 files changed

+217
-26
lines changed

3 files changed

+217
-26
lines changed

agent/app/agent.go

+33-10
Original file line numberDiff line numberDiff line change
@@ -468,10 +468,15 @@ func (agent *ecsAgent) doStart(containerChangeEventStream *eventstream.EventStre
468468
// Register the container instance
469469
err = agent.registerContainerInstance(client, vpcSubnetAttributes)
470470
if err != nil {
471-
if isTransient(err) {
472-
return exitcodes.ExitError
471+
if isTerminal(err) {
472+
// On unrecoverable error codes, agent should terminally exit.
473+
logger.Critical("Agent will terminally exit, unable to register container instance:", logger.Fields{
474+
field.Error: err,
475+
})
476+
return exitcodes.ExitTerminal
473477
}
474-
return exitcodes.ExitTerminal
478+
// Other errors are considered recoverable and will be retried.
479+
return exitcodes.ExitError
475480
}
476481

477482
// Load Managed Daemon images asynchronously
@@ -855,13 +860,19 @@ func (agent *ecsAgent) registerContainerInstance(
855860
field.Error: err,
856861
})
857862
if retriable, ok := err.(apierrors.Retriable); ok && !retriable.Retry() {
858-
return err
863+
return terminalError{err}
859864
}
860865
if utils.IsAWSErrorCodeEqual(err, ecsmodel.ErrCodeInvalidParameterException) {
861866
logger.Critical("Instance registration attempt with an invalid parameter", logger.Fields{
862867
field.Error: err,
863868
})
864-
return err
869+
return terminalError{err}
870+
}
871+
if utils.IsAWSErrorCodeEqual(err, ecsmodel.ErrCodeClientException) {
872+
logger.Critical("Instance registration attempt with client performing invalid action", logger.Fields{
873+
field.Error: err,
874+
})
875+
return terminalError{err}
865876
}
866877
if _, ok := err.(apierrors.AttributeError); ok {
867878
attributeErrorMsg := ""
@@ -871,9 +882,9 @@ func (agent *ecsAgent) registerContainerInstance(
871882
logger.Critical("Instance registration attempt with invalid attribute(s)", logger.Fields{
872883
field.Error: attributeErrorMsg,
873884
})
874-
return err
885+
return terminalError{err}
875886
}
876-
return transientError{err}
887+
return err
877888
}
878889
logger.Info("Instance registration completed successfully", logger.Fields{
879890
"instanceArn": containerInstanceArn,
@@ -903,7 +914,19 @@ func (agent *ecsAgent) reregisterContainerInstance(client ecs.ECSClient, capabil
903914
})
904915
if apierrors.IsInstanceTypeChangedError(err) {
905916
seelog.Criticalf(instanceTypeMismatchErrorFormat, err)
906-
return err
917+
return terminalError{err}
918+
}
919+
if utils.IsAWSErrorCodeEqual(err, ecsmodel.ErrCodeInvalidParameterException) {
920+
logger.Critical("Instance re-registration attempt with an invalid parameter", logger.Fields{
921+
field.Error: err,
922+
})
923+
return terminalError{err}
924+
}
925+
if utils.IsAWSErrorCodeEqual(err, ecsmodel.ErrCodeClientException) {
926+
logger.Critical("Instance re-registration attempt with client performing invalid action", logger.Fields{
927+
field.Error: err,
928+
})
929+
return terminalError{err}
907930
}
908931
if _, ok := err.(apierrors.AttributeError); ok {
909932
attributeErrorMsg := ""
@@ -913,9 +936,9 @@ func (agent *ecsAgent) reregisterContainerInstance(client ecs.ECSClient, capabil
913936
logger.Critical("Instance re-registration attempt with invalid attribute(s)", logger.Fields{
914937
field.Error: attributeErrorMsg,
915938
})
916-
return err
939+
return terminalError{err}
917940
}
918-
return transientError{err}
941+
return err
919942
}
920943

921944
// startAsyncRoutines starts all background methods

agent/app/agent_test.go

+177-10
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,7 @@ func TestNewTaskEngineRestoreFromCheckpointNewStateManagerError(t *testing.T) {
760760

761761
ec2MetadataClient := mock_ec2.NewMockEC2MetadataClient(ctrl)
762762
mockPauseLoader := mock_loader.NewMockLoader(ctrl)
763+
newError := errors.New("error")
763764
gomock.InOrder(
764765
saveableOptionFactory.EXPECT().AddSaveable("TaskEngine", gomock.Any()).Return(nil),
765766
saveableOptionFactory.EXPECT().AddSaveable("ContainerInstanceArn", gomock.Any()).Return(nil),
@@ -770,7 +771,7 @@ func TestNewTaskEngineRestoreFromCheckpointNewStateManagerError(t *testing.T) {
770771

771772
stateManagerFactory.EXPECT().NewStateManager(gomock.Any(), gomock.Any(),
772773
gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(
773-
nil, errors.New("error")),
774+
nil, newError),
774775
)
775776

776777
dataClient := newTestDataClient(t)
@@ -795,7 +796,6 @@ func TestNewTaskEngineRestoreFromCheckpointNewStateManagerError(t *testing.T) {
795796
_, _, err := agent.newTaskEngine(eventstream.NewEventStream("events", ctx),
796797
credentialsManager, dockerstate.NewTaskEngineState(), imageManager, hostResources, execCmdMgr, serviceConnectManager, daemonManagers)
797798
assert.Error(t, err)
798-
assert.False(t, isTransient(err))
799799
}
800800

801801
func TestNewTaskEngineRestoreFromCheckpointStateLoadError(t *testing.T) {
@@ -808,6 +808,7 @@ func TestNewTaskEngineRestoreFromCheckpointStateLoadError(t *testing.T) {
808808
cfg.Checkpoint = config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled}
809809
ec2MetadataClient := mock_ec2.NewMockEC2MetadataClient(ctrl)
810810
mockPauseLoader := mock_loader.NewMockLoader(ctrl)
811+
newError := errors.New("error")
811812

812813
gomock.InOrder(
813814
saveableOptionFactory.EXPECT().AddSaveable("TaskEngine", gomock.Any()).Return(nil),
@@ -819,7 +820,7 @@ func TestNewTaskEngineRestoreFromCheckpointStateLoadError(t *testing.T) {
819820
stateManagerFactory.EXPECT().NewStateManager(gomock.Any(),
820821
gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(),
821822
).Return(stateManager, nil),
822-
stateManager.EXPECT().Load().Return(errors.New("error")),
823+
stateManager.EXPECT().Load().Return(newError),
823824
)
824825

825826
dataClient := newTestDataClient(t)
@@ -844,7 +845,6 @@ func TestNewTaskEngineRestoreFromCheckpointStateLoadError(t *testing.T) {
844845
_, _, err := agent.newTaskEngine(eventstream.NewEventStream("events", ctx),
845846
credentialsManager, dockerstate.NewTaskEngineState(), imageManager, hostResources, execCmdMgr, serviceConnectManager, daemonManagers)
846847
assert.Error(t, err)
847-
assert.False(t, isTransient(err))
848848
}
849849

850850
func TestNewTaskEngineRestoreFromCheckpoint(t *testing.T) {
@@ -1086,7 +1086,7 @@ func TestReregisterContainerInstanceInstanceTypeChanged(t *testing.T) {
10861086

10871087
err := agent.registerContainerInstance(client, nil)
10881088
assert.Error(t, err)
1089-
assert.False(t, isTransient(err))
1089+
assert.True(t, isTerminal(err))
10901090
}
10911091

10921092
func TestReregisterContainerInstanceAttributeError(t *testing.T) {
@@ -1145,7 +1145,7 @@ func TestReregisterContainerInstanceAttributeError(t *testing.T) {
11451145

11461146
err := agent.registerContainerInstance(client, nil)
11471147
assert.Error(t, err)
1148-
assert.False(t, isTransient(err))
1148+
assert.True(t, isTerminal(err))
11491149
}
11501150

11511151
func TestReregisterContainerInstanceNonTerminalError(t *testing.T) {
@@ -1204,7 +1204,66 @@ func TestReregisterContainerInstanceNonTerminalError(t *testing.T) {
12041204

12051205
err := agent.registerContainerInstance(client, nil)
12061206
assert.Error(t, err)
1207-
assert.True(t, isTransient(err))
1207+
assert.False(t, isTerminal(err))
1208+
}
1209+
1210+
func TestReregisterContainerInstanceTerminalError(t *testing.T) {
1211+
ctrl := gomock.NewController(t)
1212+
defer ctrl.Finish()
1213+
1214+
mockDockerClient := mock_dockerapi.NewMockDockerClient(ctrl)
1215+
client := mock_ecs.NewMockECSClient(ctrl)
1216+
mockCredentialsProvider := app_mocks.NewMockCredentialsProvider(ctrl)
1217+
mockMobyPlugins := mock_mobypkgwrapper.NewMockPlugins(ctrl)
1218+
mockEC2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl)
1219+
mockPauseLoader := mock_loader.NewMockLoader(ctrl)
1220+
1221+
mockPauseLoader.EXPECT().IsLoaded(gomock.Any()).Return(false, nil).AnyTimes()
1222+
mockPauseLoader.EXPECT().LoadImage(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes()
1223+
mockServiceConnectManager := mock_serviceconnect.NewMockManager(ctrl)
1224+
mockServiceConnectManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes()
1225+
mockServiceConnectManager.EXPECT().GetLoadedAppnetVersion().AnyTimes()
1226+
mockServiceConnectManager.EXPECT().GetCapabilitiesForAppnetInterfaceVersion("").AnyTimes()
1227+
mockServiceConnectManager.EXPECT().SetECSClient(gomock.Any(), gomock.Any()).AnyTimes()
1228+
1229+
mockDaemonManager := mock_daemonmanager.NewMockDaemonManager(ctrl)
1230+
mockDaemonManagers := map[string]dm.DaemonManager{md.EbsCsiDriver: mockDaemonManager}
1231+
mockDaemonManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes()
1232+
mockDaemonManager.EXPECT().LoadImage(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes()
1233+
1234+
gomock.InOrder(
1235+
mockCredentialsProvider.EXPECT().Retrieve(gomock.Any()).Return(awsv2.Credentials{}, nil),
1236+
mockDockerClient.EXPECT().SupportedVersions().Return(apiVersions),
1237+
mockMobyPlugins.EXPECT().Scan().AnyTimes().Return([]string{}, nil),
1238+
mockDockerClient.EXPECT().ListPluginsWithFilters(gomock.Any(), gomock.Any(),
1239+
gomock.Any(), gomock.Any()).AnyTimes().Return([]string{}, nil),
1240+
client.EXPECT().RegisterContainerInstance(containerInstanceARN, gomock.Any(), gomock.Any(), gomock.Any(),
1241+
gomock.Any(), gomock.Any()).Return("", "", awserr.New("ClientException", "", nil)),
1242+
)
1243+
mockEC2Metadata.EXPECT().OutpostARN().Return("", nil)
1244+
1245+
cfg := getTestConfig()
1246+
cfg.Cluster = clusterName
1247+
ctx, cancel := context.WithCancel(context.TODO())
1248+
// Cancel the context to cancel async routines
1249+
defer cancel()
1250+
agent := &ecsAgent{
1251+
ctx: ctx,
1252+
cfg: &cfg,
1253+
dockerClient: mockDockerClient,
1254+
ec2MetadataClient: mockEC2Metadata,
1255+
pauseLoader: mockPauseLoader,
1256+
credentialsCache: awsv2.NewCredentialsCache(mockCredentialsProvider),
1257+
mobyPlugins: mockMobyPlugins,
1258+
serviceconnectManager: mockServiceConnectManager,
1259+
daemonManagers: mockDaemonManagers,
1260+
}
1261+
agent.containerInstanceARN = containerInstanceARN
1262+
agent.availabilityZone = availabilityZone
1263+
1264+
err := agent.registerContainerInstance(client, nil)
1265+
assert.Error(t, err)
1266+
assert.True(t, isTerminal(err))
12081267
}
12091268

12101269
func TestRegisterContainerInstanceWhenContainerInstanceARNIsNotSetHappyPath(t *testing.T) {
@@ -1320,7 +1379,7 @@ func TestRegisterContainerInstanceWhenContainerInstanceARNIsNotSetCanRetryError(
13201379

13211380
err := agent.registerContainerInstance(client, nil)
13221381
assert.Error(t, err)
1323-
assert.True(t, isTransient(err))
1382+
assert.False(t, isTerminal(err))
13241383
}
13251384

13261385
func TestRegisterContainerInstanceWhenContainerInstanceARNIsNotSetCannotRetryError(t *testing.T) {
@@ -1378,7 +1437,7 @@ func TestRegisterContainerInstanceWhenContainerInstanceARNIsNotSetCannotRetryErr
13781437

13791438
err := agent.registerContainerInstance(client, nil)
13801439
assert.Error(t, err)
1381-
assert.False(t, isTransient(err))
1440+
assert.True(t, isTerminal(err))
13821441
}
13831442

13841443
func TestRegisterContainerInstanceWhenContainerInstanceARNIsNotSetAttributeError(t *testing.T) {
@@ -1435,7 +1494,7 @@ func TestRegisterContainerInstanceWhenContainerInstanceARNIsNotSetAttributeError
14351494

14361495
err := agent.registerContainerInstance(client, nil)
14371496
assert.Error(t, err)
1438-
assert.False(t, isTransient(err))
1497+
assert.True(t, isTerminal(err))
14391498
}
14401499

14411500
func TestRegisterContainerInstanceInvalidParameterTerminalError(t *testing.T) {
@@ -1499,6 +1558,114 @@ func TestRegisterContainerInstanceInvalidParameterTerminalError(t *testing.T) {
14991558
credentialsManager, state, imageManager, client, execCmdMgr)
15001559
assert.Equal(t, exitcodes.ExitTerminal, exitCode)
15011560
}
1561+
1562+
func TestRegisterContainerInstanceExceptionErrors(t *testing.T) {
1563+
testCases := []struct {
1564+
name string
1565+
regError error
1566+
exitCode int
1567+
}{
1568+
{
1569+
name: "InvalidParameterException",
1570+
regError: awserr.New("InvalidParameterException", "", nil),
1571+
exitCode: exitcodes.ExitTerminal,
1572+
},
1573+
{
1574+
name: "ClientException",
1575+
regError: awserr.New("ClientException", "", nil),
1576+
exitCode: exitcodes.ExitTerminal,
1577+
},
1578+
{
1579+
name: "ThrottlingException",
1580+
regError: awserr.New("ThrottlingException", "", nil),
1581+
exitCode: exitcodes.ExitError,
1582+
},
1583+
}
1584+
1585+
for _, tc := range testCases {
1586+
t.Run(tc.name, func(t *testing.T) {
1587+
ctrl, credentialsManager, state, imageManager, client,
1588+
dockerClient, _, _, execCmdMgr, _ := setup(t)
1589+
defer ctrl.Finish()
1590+
1591+
mockCredentialsProvider := app_mocks.NewMockCredentialsProvider(ctrl)
1592+
mockMobyPlugins := mock_mobypkgwrapper.NewMockPlugins(ctrl)
1593+
mockEC2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl)
1594+
mockPauseLoader := mock_loader.NewMockLoader(ctrl)
1595+
mockServiceConnectManager := mock_serviceconnect.NewMockManager(ctrl)
1596+
mockDaemonManager := mock_daemonmanager.NewMockDaemonManager(ctrl)
1597+
mockDaemonManagers := map[string]dm.DaemonManager{md.EbsCsiDriver: mockDaemonManager}
1598+
1599+
mockPauseLoader.EXPECT().IsLoaded(gomock.Any()).Return(false, nil).AnyTimes()
1600+
mockPauseLoader.EXPECT().LoadImage(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes()
1601+
1602+
mockEC2Metadata.EXPECT().PrimaryENIMAC().Return("mac", nil)
1603+
mockEC2Metadata.EXPECT().VPCID("mac").Return("vpc-id", nil)
1604+
mockEC2Metadata.EXPECT().SubnetID("mac").Return("subnet-id", nil)
1605+
mockEC2Metadata.EXPECT().OutpostARN().Return("", nil)
1606+
1607+
mockServiceConnectManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes()
1608+
mockServiceConnectManager.EXPECT().GetLoadedAppnetVersion().AnyTimes()
1609+
mockServiceConnectManager.EXPECT().GetCapabilitiesForAppnetInterfaceVersion("").AnyTimes()
1610+
mockServiceConnectManager.EXPECT().SetECSClient(gomock.Any(), gomock.Any()).AnyTimes()
1611+
1612+
mockDaemonManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes()
1613+
mockDaemonManager.EXPECT().LoadImage(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes()
1614+
1615+
dockerClient.EXPECT().SupportedVersions().Return(apiVersions).AnyTimes()
1616+
1617+
gomock.InOrder(
1618+
client.EXPECT().GetHostResources().Return(testHostResource, nil),
1619+
mockCredentialsProvider.EXPECT().Retrieve(gomock.Any()).Return(awsv2.Credentials{}, nil),
1620+
mockMobyPlugins.EXPECT().Scan().AnyTimes().Return([]string{}, nil),
1621+
dockerClient.EXPECT().ListPluginsWithFilters(gomock.Any(), gomock.Any(), gomock.Any(),
1622+
gomock.Any()).AnyTimes().Return([]string{}, nil),
1623+
1624+
client.EXPECT().
1625+
RegisterContainerInstance(
1626+
gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(),
1627+
gomock.Any(), gomock.Any(),
1628+
).
1629+
Return("", "", tc.regError),
1630+
)
1631+
1632+
cfg := getTestConfig()
1633+
ctx, cancel := context.WithCancel(context.TODO())
1634+
defer cancel()
1635+
1636+
agent := &ecsAgent{
1637+
ctx: ctx,
1638+
ec2MetadataClient: mockEC2Metadata,
1639+
cfg: &cfg,
1640+
pauseLoader: mockPauseLoader,
1641+
credentialsCache: awsv2.NewCredentialsCache(mockCredentialsProvider),
1642+
dockerClient: dockerClient,
1643+
mobyPlugins: mockMobyPlugins,
1644+
terminationHandler: func(
1645+
taskEngineState dockerstate.TaskEngineState,
1646+
dataClient data.Client,
1647+
taskEngine engine.TaskEngine,
1648+
cancel context.CancelFunc,
1649+
) {
1650+
},
1651+
serviceconnectManager: mockServiceConnectManager,
1652+
daemonManagers: mockDaemonManagers,
1653+
}
1654+
1655+
exitCode := agent.doStart(
1656+
eventstream.NewEventStream("events", ctx),
1657+
credentialsManager,
1658+
state,
1659+
imageManager,
1660+
client,
1661+
execCmdMgr,
1662+
)
1663+
1664+
assert.Equal(t, tc.exitCode, exitCode)
1665+
})
1666+
}
1667+
}
1668+
15021669
func TestMergeTags(t *testing.T) {
15031670
ec2Key := "ec2Key"
15041671
ec2Value := "ec2Value"

agent/app/errors.go

+7-6
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,16 @@
1313

1414
package app
1515

16-
// transientError represents a transient error when executing the ECS Agent
17-
type transientError struct {
16+
type terminalError struct {
1817
error
1918
}
2019

21-
// isTransient returns true if the error is transient
22-
func isTransient(err error) bool {
23-
_, ok := err.(transientError)
24-
return ok
20+
// isTerminal returns true if the error is already wrapped as an unrecoverable condition
21+
// which will allow agent to exit terminally.
22+
func isTerminal(err error) bool {
23+
// Check if the error is already wrapped as a terminalError
24+
_, terminal := err.(terminalError)
25+
return terminal
2526
}
2627

2728
// clusterMismatchError represents a mismatch in cluster name between the

0 commit comments

Comments
 (0)