From 9d3eb620a20873a6d683c96f7251c443348fe374 Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Thu, 20 Nov 2025 19:34:59 -0800 Subject: [PATCH 01/13] add fault domain retry logic --- .gitignore | 6 +- cloud/ociutil/ociutil.go | 35 +++++++ cloud/ociutil/ociutil_test.go | 63 ++++++++++++ cloud/scope/machine.go | 119 +++++++++++++++++++++-- cloud/scope/machine_test.go | 176 ++++++++++++++++++++++++++++++++++ 5 files changed, 391 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 3f29d9587..7a3f22d52 100644 --- a/.gitignore +++ b/.gitignore @@ -42,4 +42,8 @@ tilt-settings.json # git worktrees -worktrees/ \ No newline at end of file +worktrees/capoci-example-cluster.kubeconfig +capoci-example-cluster.yaml +env.sh +provider-config-example.yaml +capoci-example-cluster.kubeconfig diff --git a/cloud/ociutil/ociutil.go b/cloud/ociutil/ociutil.go index 93976190a..93e85403b 100644 --- a/cloud/ociutil/ociutil.go +++ b/cloud/ociutil/ociutil.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "net/http" + "strings" "time" lb "github.com/oracle/cluster-api-provider-oci/cloud/services/loadbalancer" @@ -172,6 +173,40 @@ func BuildClusterTags(ClusterResourceUID string) map[string]string { return tags } +// NOTE: Currently we only key off the documented "Out of host capacity" error. +// If OCI starts surfacing additional codes/messages we can expand this list. +// Reference: https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/troubleshooting-out-of-host-capacity.htm +var outOfCapacityErrorCodes = map[string]struct{}{ + "OUTOFHOSTCAPACITY": {}, +} + +var outOfCapacityErrorMessages = []string{ + "out of host capacity", +} + +// IsOutOfHostCapacityError returns true when the OCI service error indicates that the fault domain ran out of capacity. +func IsOutOfHostCapacityError(err error) bool { + if err == nil { + return false + } + err = errors.Cause(err) + serviceErr, ok := common.IsServiceError(err) + if !ok { + return false + } + code := serviceErr.GetCode() + if _, found := outOfCapacityErrorCodes[strings.ToUpper(code)]; found { + return true + } + message := strings.ToLower(serviceErr.GetMessage()) + for _, fragment := range outOfCapacityErrorMessages { + if strings.Contains(message, fragment) { + return true + } + } + return false +} + // DerefString returns the string value if the pointer isn't nil, otherwise returns empty string func DerefString(s *string) string { if s != nil { diff --git a/cloud/ociutil/ociutil_test.go b/cloud/ociutil/ociutil_test.go index 37449c0b1..0cd9a567b 100644 --- a/cloud/ociutil/ociutil_test.go +++ b/cloud/ociutil/ociutil_test.go @@ -83,3 +83,66 @@ func TestAddToDefaultClusterTags(t *testing.T) { } } } + +func TestIsOutOfHostCapacityError(t *testing.T) { + testCases := []struct { + name string + err error + expected bool + }{ + { + name: "matches by code", + err: fakeServiceError{code: "OutOfHostCapacity", message: "any"}, + expected: true, + }, + { + name: "matches by message", + err: fakeServiceError{code: "Other", message: "Instance launch failed due to out of host capacity"}, + expected: true, + }, + { + name: "non service error", + err: fmt.Errorf("boom"), + expected: false, + }, + { + name: "nil error", + err: nil, + expected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actual := IsOutOfHostCapacityError(tc.err) + if actual != tc.expected { + t.Fatalf("expected %t but got %t for test %s", tc.expected, actual, tc.name) + } + }) + } +} + +type fakeServiceError struct { + code string + message string +} + +func (f fakeServiceError) Error() string { + return f.message +} + +func (f fakeServiceError) GetHTTPStatusCode() int { + return 400 +} + +func (f fakeServiceError) GetMessage() string { + return f.message +} + +func (f fakeServiceError) GetCode() string { + return f.code +} + +func (f fakeServiceError) GetOpcRequestID() string { + return "" +} diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 64d581630..96fa3578a 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -23,6 +23,7 @@ import ( "fmt" "math/big" "net/url" + "sort" "strconv" "github.com/go-logr/logr" @@ -82,6 +83,11 @@ type MachineScope struct { WorkRequestsClient wr.Client } +type faultDomainAttempt struct { + AvailabilityDomain string + FaultDomain string +} + // NewMachineScope creates a MachineScope given the MachineScopeParams func NewMachineScope(params MachineScopeParams) (*MachineScope, error) { if params.Machine == nil { @@ -232,6 +238,7 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, // the random number generated is between zero and two, whereas we need a number between one and three failureDomain = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) availabilityDomain = m.OCIClusterAccessor.GetFailureDomains()[*failureDomain].Attributes[AvailabilityDomain] + faultDomain = m.OCIClusterAccessor.GetFailureDomains()[*failureDomain].Attributes[FaultDomain] } metadata := m.OCIMachine.Spec.Metadata @@ -287,14 +294,112 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, launchDetails.PreemptibleInstanceConfig = m.getPreemptibleInstanceConfig() launchDetails.PlatformConfig = m.getPlatformConfig() launchDetails.LaunchVolumeAttachments = m.getLaunchVolumeAttachments() - req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails, - OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))} - resp, err := m.ComputeClient.LaunchInstance(ctx, req) - if err != nil { - return nil, err - } else { - return &resp.Instance, nil + // Build the list of availability/fault domain combinations we will try + // when launching the instance (primary FD first, then fallbacks). + faultDomains := m.buildFaultDomainLaunchAttempts(availabilityDomain, faultDomain) + return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains) +} + +func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, baseDetails core.LaunchInstanceDetails, attempts []faultDomainAttempt) (*core.Instance, error) { + if len(attempts) == 0 { + attempts = append(attempts, faultDomainAttempt{ + AvailabilityDomain: ociutil.DerefString(baseDetails.AvailabilityDomain), + }) + } + + opcRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID)) + var lastErr error + for idx, attempt := range attempts { + details := baseDetails + if attempt.AvailabilityDomain != "" { + details.AvailabilityDomain = common.String(attempt.AvailabilityDomain) + } + if attempt.FaultDomain != "" { + details.FaultDomain = common.String(attempt.FaultDomain) + } else { + details.FaultDomain = nil + } + + resp, err := m.ComputeClient.LaunchInstance(ctx, core.LaunchInstanceRequest{ + LaunchInstanceDetails: details, + OpcRetryToken: opcRetryToken, + }) + if err == nil { + return &resp.Instance, nil + } + lastAttempt := idx == len(attempts)-1 + if !ociutil.IsOutOfHostCapacityError(err) || lastAttempt { + return nil, err + } + lastErr = err + m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "faultDomain", attempt.FaultDomain) } + return nil, lastErr +} + +const defaultFaultDomainKey = "__no_fault_domain__" + +func (m *MachineScope) buildFaultDomainLaunchAttempts(availabilityDomain, initialFaultDomain string) []faultDomainAttempt { + var attempts []faultDomainAttempt + seen := make(map[string]bool) + addAttempt := func(fd string) { + key := fd + if fd == "" { + key = defaultFaultDomainKey + } + if seen[key] { + return + } + seen[key] = true + attempts = append(attempts, faultDomainAttempt{ + AvailabilityDomain: availabilityDomain, + FaultDomain: fd, + }) + } + + addAttempt(initialFaultDomain) + if availabilityDomain == "" { + return attempts + } + + // Prefer fault domains exposed via the Cluster status. This respects + // Cluster API's scheduling decisions before falling back to raw OCI data. + failureDomains := m.OCIClusterAccessor.GetFailureDomains() + if len(failureDomains) > 0 { + keys := make([]string, 0, len(failureDomains)) + for key := range failureDomains { + keys = append(keys, key) + } + sort.Strings(keys) + for _, key := range keys { + spec := failureDomains[key] + if spec.Attributes[AvailabilityDomain] != availabilityDomain { + continue + } + fd := spec.Attributes[FaultDomain] + if fd == "" { + continue + } + addAttempt(fd) + } + } + + if len(attempts) > 1 { + return attempts + } + + // If the cluster status didn't enumerate any additional fault domains, + // fall back to the cached availability-domain data gathered from OCI so we + // can still iterate through every physical fault domain in that AD. + if adMap := m.OCIClusterAccessor.GetAvailabilityDomains(); adMap != nil { + if adEntry, ok := adMap[availabilityDomain]; ok { + for _, fd := range adEntry.FaultDomains { + addAttempt(fd) + } + } + } + + return attempts } func (m *MachineScope) getFreeFormTags() map[string]string { diff --git a/cloud/scope/machine_test.go b/cloud/scope/machine_test.go index c7086394e..1593386f2 100644 --- a/cloud/scope/machine_test.go +++ b/cloud/scope/machine_test.go @@ -320,6 +320,130 @@ func TestInstanceReconciliation(t *testing.T) { })).Return(core.LaunchInstanceResponse{}, nil) }, }, + { + name: "retries alternate fault domains on capacity errors", + errorExpected: false, + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-1", + }, + }, + "2": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-2", + }, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{ + Instance: core.Instance{ + Id: common.String("instance-id"), + }, + }, nil) + + gomock.InOrder(first, second) + }, + }, + { + name: "returns error after exhausting all fault domains", + errorExpected: true, + errorSubStringMatch: true, + matchError: errors.New("out of host capacity"), + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-1", + }, + }, + "2": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-2", + }, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity in fd2")) + + gomock.InOrder(first, second) + }, + }, + { + name: "retries fault domains discovered from availability domain cache", + errorExpected: false, + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + }, + }, + } + ociCluster.Spec.AvailabilityDomains = map[string]infrastructurev1beta2.OCIAvailabilityDomain{ + "ad1": { + Name: "ad1", + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2"}, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + third := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{ + Instance: core.Instance{ + Id: common.String("instance-id"), + }, + }, nil) + + gomock.InOrder(first, second, third) + }, + }, { name: "check all params together", errorExpected: false, @@ -1487,6 +1611,26 @@ func instanceCompartmentIDMatcher(request interface{}, matchStr string) error { return nil } +func faultDomainRequestMatcher(request interface{}, matchStr string) error { + r, ok := request.(core.LaunchInstanceRequest) + if !ok { + return errors.New("expecting LaunchInstanceRequest type") + } + if matchStr == "" { + if r.LaunchInstanceDetails.FaultDomain != nil { + return errors.New("expected fault domain to be nil") + } + return nil + } + if r.LaunchInstanceDetails.FaultDomain == nil { + return errors.New("fault domain was nil") + } + if *r.LaunchInstanceDetails.FaultDomain != matchStr { + return errors.New(fmt.Sprintf("expecting fault domain %s but got %s", matchStr, *r.LaunchInstanceDetails.FaultDomain)) + } + return nil +} + func platformConfigMatcher(actual interface{}, expected core.PlatformConfig) error { r, ok := actual.(core.LaunchInstanceRequest) if !ok { @@ -2925,3 +3069,35 @@ func setupAllParams(ms *MachineScope) { ociCluster.Spec.OCIResourceIdentifier = "resource_uid" ms.OCIMachine.UID = "machineuid" } + +type testServiceError struct { + code string + message string +} + +func newOutOfCapacityServiceError(message string) error { + return testServiceError{ + code: "OutOfHostCapacity", + message: message, + } +} + +func (t testServiceError) Error() string { + return t.message +} + +func (t testServiceError) GetHTTPStatusCode() int { + return 400 +} + +func (t testServiceError) GetMessage() string { + return t.message +} + +func (t testServiceError) GetCode() string { + return t.code +} + +func (t testServiceError) GetOpcRequestID() string { + return "" +} From 266a7579d87cc31b726b47f054dc933a2d593f33 Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Thu, 20 Nov 2025 21:59:32 -0800 Subject: [PATCH 02/13] remove untracked file --- .gitignore | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 7a3f22d52..4ab399a19 100644 --- a/.gitignore +++ b/.gitignore @@ -42,8 +42,4 @@ tilt-settings.json # git worktrees -worktrees/capoci-example-cluster.kubeconfig -capoci-example-cluster.yaml -env.sh -provider-config-example.yaml -capoci-example-cluster.kubeconfig +worktrees/ From 821aa45db9c872603e772c1e88b39b2a8952b79b Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Sat, 22 Nov 2025 19:45:11 -0800 Subject: [PATCH 03/13] cleanup --- .gitignore | 2 +- cloud/ociutil/ociutil_test.go | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 4ab399a19..3f29d9587 100644 --- a/.gitignore +++ b/.gitignore @@ -42,4 +42,4 @@ tilt-settings.json # git worktrees -worktrees/ +worktrees/ \ No newline at end of file diff --git a/cloud/ociutil/ociutil_test.go b/cloud/ociutil/ociutil_test.go index 0cd9a567b..dee7f5a57 100644 --- a/cloud/ociutil/ociutil_test.go +++ b/cloud/ociutil/ociutil_test.go @@ -143,6 +143,4 @@ func (f fakeServiceError) GetCode() string { return f.code } -func (f fakeServiceError) GetOpcRequestID() string { - return "" -} +func (f fakeServiceError) GetOpcRequestID() string { return "" } From a7aa01f5341d0d552c386bf0a3d2fe7c1d414735 Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Mon, 1 Dec 2025 09:40:16 -0800 Subject: [PATCH 04/13] cleanup --- cloud/scope/machine.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 4ac743ee5..759693808 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -305,8 +305,7 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, launchDetails.PreemptibleInstanceConfig = m.getPreemptibleInstanceConfig() launchDetails.PlatformConfig = m.getPlatformConfig() launchDetails.LaunchVolumeAttachments = m.getLaunchVolumeAttachments() - // Build the list of availability/fault domain combinations we will try - // when launching the instance (primary FD first, then fallbacks). + // Build the list of availability/fault domain to attempt launch in faultDomains := m.buildFaultDomainLaunchAttempts(availabilityDomain, faultDomain) return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains) } @@ -373,8 +372,7 @@ func (m *MachineScope) buildFaultDomainLaunchAttempts(availabilityDomain, initia return attempts } - // Prefer fault domains exposed via the Cluster status. This respects - // Cluster API's scheduling decisions before falling back to raw OCI data. + // Build faultDomainAttempt list failureDomains := m.OCIClusterAccessor.GetFailureDomains() if len(failureDomains) > 0 { keys := make([]string, 0, len(failureDomains)) @@ -400,8 +398,7 @@ func (m *MachineScope) buildFaultDomainLaunchAttempts(availabilityDomain, initia } // If the cluster status didn't enumerate any additional fault domains, - // fall back to the cached availability-domain data gathered from OCI so we - // can still iterate through every physical fault domain in that AD. + // fall back to every physical fault domain in that AD. if adMap := m.OCIClusterAccessor.GetAvailabilityDomains(); adMap != nil { if adEntry, ok := adMap[availabilityDomain]; ok { for _, fd := range adEntry.FaultDomains { From dfe0740cbc1ad61d1b62e7bf2347ab565c6acd89 Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Tue, 2 Dec 2025 21:37:39 -0800 Subject: [PATCH 05/13] simplify logic --- cloud/ociutil/ociutil.go | 33 +------ cloud/ociutil/ociutil_test.go | 37 ++----- cloud/scope/machine.go | 177 +++++++++++++--------------------- cloud/scope/machine_test.go | 68 ++++--------- 4 files changed, 99 insertions(+), 216 deletions(-) diff --git a/cloud/ociutil/ociutil.go b/cloud/ociutil/ociutil.go index 93e85403b..2f7dc01c2 100644 --- a/cloud/ociutil/ociutil.go +++ b/cloud/ociutil/ociutil.go @@ -42,6 +42,7 @@ const ( CreatedBy = "CreatedBy" OCIClusterAPIProvider = "OCIClusterAPIProvider" ClusterResourceIdentifier = "ClusterResourceIdentifier" + OutOfHostCapacityErr = "Out of host capacity" ) // ErrNotFound is for simulation during testing, OCI SDK does not have a way @@ -173,38 +174,12 @@ func BuildClusterTags(ClusterResourceUID string) map[string]string { return tags } -// NOTE: Currently we only key off the documented "Out of host capacity" error. -// If OCI starts surfacing additional codes/messages we can expand this list. -// Reference: https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/troubleshooting-out-of-host-capacity.htm -var outOfCapacityErrorCodes = map[string]struct{}{ - "OUTOFHOSTCAPACITY": {}, -} - -var outOfCapacityErrorMessages = []string{ - "out of host capacity", -} - -// IsOutOfHostCapacityError returns true when the OCI service error indicates that the fault domain ran out of capacity. -func IsOutOfHostCapacityError(err error) bool { +// IsOutOfHostCapacity returns true when the error message indicates that the fault domain ran out of capacity. +func IsOutOfHostCapacity(err error) bool { if err == nil { return false } - err = errors.Cause(err) - serviceErr, ok := common.IsServiceError(err) - if !ok { - return false - } - code := serviceErr.GetCode() - if _, found := outOfCapacityErrorCodes[strings.ToUpper(code)]; found { - return true - } - message := strings.ToLower(serviceErr.GetMessage()) - for _, fragment := range outOfCapacityErrorMessages { - if strings.Contains(message, fragment) { - return true - } - } - return false + return strings.Contains(strings.ToLower(err.Error()), strings.ToLower(OutOfHostCapacityErr)) } // DerefString returns the string value if the pointer isn't nil, otherwise returns empty string diff --git a/cloud/ociutil/ociutil_test.go b/cloud/ociutil/ociutil_test.go index dee7f5a57..7db2a0a52 100644 --- a/cloud/ociutil/ociutil_test.go +++ b/cloud/ociutil/ociutil_test.go @@ -84,24 +84,24 @@ func TestAddToDefaultClusterTags(t *testing.T) { } } -func TestIsOutOfHostCapacityError(t *testing.T) { +func TestIsOutOfHostCapacity(t *testing.T) { testCases := []struct { name string err error expected bool }{ { - name: "matches by code", - err: fakeServiceError{code: "OutOfHostCapacity", message: "any"}, + name: "matches exact message", + err: fmt.Errorf(OutOfHostCapacityErr), expected: true, }, { - name: "matches by message", - err: fakeServiceError{code: "Other", message: "Instance launch failed due to out of host capacity"}, + name: "matches substring", + err: fmt.Errorf("Instance launch failed due to %s in chosen fd", OutOfHostCapacityErr), expected: true, }, { - name: "non service error", + name: "non matching message", err: fmt.Errorf("boom"), expected: false, }, @@ -114,33 +114,10 @@ func TestIsOutOfHostCapacityError(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - actual := IsOutOfHostCapacityError(tc.err) + actual := IsOutOfHostCapacity(tc.err) if actual != tc.expected { t.Fatalf("expected %t but got %t for test %s", tc.expected, actual, tc.name) } }) } } - -type fakeServiceError struct { - code string - message string -} - -func (f fakeServiceError) Error() string { - return f.message -} - -func (f fakeServiceError) GetHTTPStatusCode() int { - return 400 -} - -func (f fakeServiceError) GetMessage() string { - return f.message -} - -func (f fakeServiceError) GetCode() string { - return f.code -} - -func (f fakeServiceError) GetOpcRequestID() string { return "" } diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 759693808..4199854c7 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -23,7 +23,6 @@ import ( "fmt" "math/big" "net/url" - "sort" "strconv" "github.com/go-logr/logr" @@ -83,11 +82,6 @@ type MachineScope struct { WorkRequestsClient wr.Client } -type faultDomainAttempt struct { - AvailabilityDomain string - FaultDomain string -} - // NewMachineScope creates a MachineScope given the MachineScopeParams func NewMachineScope(params MachineScopeParams) (*MachineScope, error) { if params.Machine == nil { @@ -212,44 +206,9 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, } } - failureDomain := m.Machine.Spec.FailureDomain - var faultDomain string - var availabilityDomain string - if failureDomain != nil { - failureDomainIndex, err := strconv.Atoi(*failureDomain) - if err != nil { - m.Logger.Error(err, "Failure Domain is not a valid integer") - return nil, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer") - } - m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex) - if failureDomainIndex < 1 || failureDomainIndex > 3 { - err = errors.New("failure domain should be a value between 1 and 3") - m.Logger.Error(err, "Failure domain should be a value between 1 and 3") - return nil, err - } - fd, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomain] - if !exists { - err = errors.New("failure domain not found in cluster failure domains") - m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomain) - return nil, err - } - faultDomain = fd.Attributes[FaultDomain] - availabilityDomain = fd.Attributes[AvailabilityDomain] - } else { - randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3)) - if err != nil { - m.Logger.Error(err, "Failed to generate random failure domain") - return nil, err - } - // the random number generated is between zero and two, whereas we need a number between one and three - failureDomain = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) - fd, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomain] - if !exists { - err = errors.New("failure domain not found in cluster failure domains") - m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomain) - return nil, err - } - availabilityDomain = fd.Attributes[AvailabilityDomain] + availabilityDomain, faultDomain, retry, err := m.resolveAvailabilityAndFaultDomain() + if err != nil { + return nil, err } metadata := m.OCIMachine.Spec.Metadata @@ -305,27 +264,21 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, launchDetails.PreemptibleInstanceConfig = m.getPreemptibleInstanceConfig() launchDetails.PlatformConfig = m.getPlatformConfig() launchDetails.LaunchVolumeAttachments = m.getLaunchVolumeAttachments() - // Build the list of availability/fault domain to attempt launch in - faultDomains := m.buildFaultDomainLaunchAttempts(availabilityDomain, faultDomain) + // Build the list of fault domains to attempt launch in + faultDomains := m.buildFaultDomainLaunchList(availabilityDomain, faultDomain, retry) return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains) } -func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, baseDetails core.LaunchInstanceDetails, attempts []faultDomainAttempt) (*core.Instance, error) { - if len(attempts) == 0 { - attempts = append(attempts, faultDomainAttempt{ - AvailabilityDomain: ociutil.DerefString(baseDetails.AvailabilityDomain), - }) +func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, baseDetails core.LaunchInstanceDetails, faultDomains []string) (*core.Instance, error) { + if len(faultDomains) == 0 { + faultDomains = []string{ociutil.DerefString(baseDetails.FaultDomain)} } - opcRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID)) var lastErr error - for idx, attempt := range attempts { + for idx, fd := range faultDomains { details := baseDetails - if attempt.AvailabilityDomain != "" { - details.AvailabilityDomain = common.String(attempt.AvailabilityDomain) - } - if attempt.FaultDomain != "" { - details.FaultDomain = common.String(attempt.FaultDomain) + if fd != "" { + details.FaultDomain = common.String(fd) } else { details.FaultDomain = nil } @@ -337,77 +290,81 @@ func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, b if err == nil { return &resp.Instance, nil } - lastAttempt := idx == len(attempts)-1 - if !ociutil.IsOutOfHostCapacityError(err) || lastAttempt { + lastAttempt := idx == len(faultDomains)-1 + if !ociutil.IsOutOfHostCapacity(err) || lastAttempt { return nil, err } lastErr = err - m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "faultDomain", attempt.FaultDomain) + m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "faultDomain", fd) } return nil, lastErr } -const defaultFaultDomainKey = "__no_fault_domain__" - -func (m *MachineScope) buildFaultDomainLaunchAttempts(availabilityDomain, initialFaultDomain string) []faultDomainAttempt { - var attempts []faultDomainAttempt - seen := make(map[string]bool) - addAttempt := func(fd string) { - key := fd - if fd == "" { - key = defaultFaultDomainKey - } - if seen[key] { - return - } - seen[key] = true - attempts = append(attempts, faultDomainAttempt{ - AvailabilityDomain: availabilityDomain, - FaultDomain: fd, - }) - } - - addAttempt(initialFaultDomain) - if availabilityDomain == "" { +func (m *MachineScope) buildFaultDomainLaunchList(availabilityDomain, initialFaultDomain string, retryAcrossFaultDomains bool) []string { + attempts := []string{initialFaultDomain} + if !retryAcrossFaultDomains || availabilityDomain == "" { return attempts } - // Build faultDomainAttempt list - failureDomains := m.OCIClusterAccessor.GetFailureDomains() - if len(failureDomains) > 0 { - keys := make([]string, 0, len(failureDomains)) - for key := range failureDomains { - keys = append(keys, key) - } - sort.Strings(keys) - for _, key := range keys { - spec := failureDomains[key] - if spec.Attributes[AvailabilityDomain] != availabilityDomain { - continue - } - fd := spec.Attributes[FaultDomain] - if fd == "" { - continue - } - addAttempt(fd) + for _, fd := range m.faultDomainsForAvailabilityDomain(availabilityDomain) { + if fd == initialFaultDomain { + continue } + attempts = append(attempts, fd) } - if len(attempts) > 1 { - return attempts - } + return attempts +} - // If the cluster status didn't enumerate any additional fault domains, - // fall back to every physical fault domain in that AD. +func (m *MachineScope) faultDomainsForAvailabilityDomain(availabilityDomain string) []string { if adMap := m.OCIClusterAccessor.GetAvailabilityDomains(); adMap != nil { if adEntry, ok := adMap[availabilityDomain]; ok { - for _, fd := range adEntry.FaultDomains { - addAttempt(fd) - } + return append([]string(nil), adEntry.FaultDomains...) } } + return nil +} - return attempts +func (m *MachineScope) resolveAvailabilityAndFaultDomain() (string, string, bool, error) { + failureDomainKey := m.Machine.Spec.FailureDomain + if failureDomainKey == nil { + randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3)) + if err != nil { + m.Logger.Error(err, "Failed to generate random failure domain") + return "", "", false, err + } + failureDomainKey = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) + } + + fdEntry, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomainKey] + if !exists { + err := errors.New("failure domain not found in cluster failure domains") + m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomainKey) + return "", "", false, err + } + availabilityDomain := fdEntry.Attributes[AvailabilityDomain] + faultDomain := fdEntry.Attributes[FaultDomain] + originalFaultDomain := faultDomain + + // If fault domain is not specified, pick a random one from the availability domain + // if faultDomain == "" { + // fds := m.faultDomainsForAvailabilityDomain(availabilityDomain) + // if len(fds) == 0 { + // err := errors.New("no fault domains discovered for availability domain") + // m.Logger.Error(err, "Fault domain discovery failed", "availability-domain", availabilityDomain) + // return "", "", false, err + // } + + // randomIndex, err := rand.Int(rand.Reader, big.NewInt(int64(len(fds)))) + // if err != nil { + // m.Logger.Error(err, "Failed to generate random fallback fault domain") + // return "", "", false, err + // } + // faultDomain = fds[randomIndex.Int64()] + // } + + m.Logger.Info("Failure Domain being used", "failure-domain", *failureDomainKey) + return availabilityDomain, faultDomain, originalFaultDomain != "", nil } func (m *MachineScope) getFreeFormTags() map[string]string { diff --git a/cloud/scope/machine_test.go b/cloud/scope/machine_test.go index 1593386f2..9b637d8e6 100644 --- a/cloud/scope/machine_test.go +++ b/cloud/scope/machine_test.go @@ -401,8 +401,10 @@ func TestInstanceReconciliation(t *testing.T) { }, }, { - name: "retries fault domains discovered from availability domain cache", - errorExpected: false, + name: "does not retry when fault domain is derived from availability domain cache", + errorExpected: true, + errorSubStringMatch: true, + matchError: errors.New("out of host capacity"), testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { setupAllParams(ms) ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster @@ -416,7 +418,7 @@ func TestInstanceReconciliation(t *testing.T) { ociCluster.Spec.AvailabilityDomains = map[string]infrastructurev1beta2.OCIAvailabilityDomain{ "ad1": { Name: "ad1", - FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2"}, + FaultDomains: []string{"FAULT-DOMAIN-1"}, }, } ms.Machine.Spec.FailureDomain = common.String("1") @@ -425,23 +427,9 @@ func TestInstanceReconciliation(t *testing.T) { CompartmentId: common.String("test"), })).Return(core.ListInstancesResponse{}, nil) - first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { - return faultDomainRequestMatcher(request, "") - })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) - - second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) - - third := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { - return faultDomainRequestMatcher(request, "FAULT-DOMAIN-2") - })).Return(core.LaunchInstanceResponse{ - Instance: core.Instance{ - Id: common.String("instance-id"), - }, - }, nil) - - gomock.InOrder(first, second, third) }, }, { @@ -3058,6 +3046,20 @@ func setupAllParams(ms *MachineScope) { }, }, } + ociCluster.Spec.AvailabilityDomains = map[string]infrastructurev1beta2.OCIAvailabilityDomain{ + "ad1": { + Name: "ad1", + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2", "FAULT-DOMAIN-3"}, + }, + "ad2": { + Name: "ad2", + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2", "FAULT-DOMAIN-3"}, + }, + "ad3": { + Name: "ad3", + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2", "FAULT-DOMAIN-3"}, + }, + } ms.Machine.Spec.FailureDomain = common.String("2") ociCluster.Spec.NetworkSpec.Vcn.Subnets = []*infrastructurev1beta2.Subnet{ { @@ -3070,34 +3072,6 @@ func setupAllParams(ms *MachineScope) { ms.OCIMachine.UID = "machineuid" } -type testServiceError struct { - code string - message string -} - func newOutOfCapacityServiceError(message string) error { - return testServiceError{ - code: "OutOfHostCapacity", - message: message, - } -} - -func (t testServiceError) Error() string { - return t.message -} - -func (t testServiceError) GetHTTPStatusCode() int { - return 400 -} - -func (t testServiceError) GetMessage() string { - return t.message -} - -func (t testServiceError) GetCode() string { - return t.code -} - -func (t testServiceError) GetOpcRequestID() string { - return "" + return fmt.Errorf("%s: %s", ociutil.OutOfHostCapacityErr, message) } From cf707a10f03436a20ebe7617e07979441bebbc67 Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Wed, 3 Dec 2025 09:22:25 -0800 Subject: [PATCH 06/13] revise unit test --- cloud/scope/machine.go | 42 +++++++++++++-------------------- cloud/scope/machine_test.go | 47 ++++--------------------------------- 2 files changed, 20 insertions(+), 69 deletions(-) diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 4199854c7..c0faf602c 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -279,10 +279,7 @@ func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, b details := baseDetails if fd != "" { details.FaultDomain = common.String(fd) - } else { - details.FaultDomain = nil } - resp, err := m.ComputeClient.LaunchInstance(ctx, core.LaunchInstanceRequest{ LaunchInstanceDetails: details, OpcRetryToken: opcRetryToken, @@ -300,9 +297,9 @@ func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, b return nil, lastErr } -func (m *MachineScope) buildFaultDomainLaunchList(availabilityDomain, initialFaultDomain string, retryAcrossFaultDomains bool) []string { +func (m *MachineScope) buildFaultDomainLaunchList(availabilityDomain, initialFaultDomain string, retry bool) []string { attempts := []string{initialFaultDomain} - if !retryAcrossFaultDomains || availabilityDomain == "" { + if !retry || availabilityDomain == "" { return attempts } @@ -336,6 +333,18 @@ func (m *MachineScope) resolveAvailabilityAndFaultDomain() (string, string, bool failureDomainKey = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) } + failureDomainIndex, err := strconv.Atoi(*failureDomainKey) + if err != nil { + m.Logger.Error(err, "Failure Domain is not a valid integer") + return "", "", false, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer") + } + if failureDomainIndex < 1 || failureDomainIndex > 3 { + err = errors.New("failure domain should be a value between 1 and 3") + m.Logger.Error(err, "Failure domain should be a value between 1 and 3") + return "", "", false, err + } + m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex) + fdEntry, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomainKey] if !exists { err := errors.New("failure domain not found in cluster failure domains") @@ -344,27 +353,8 @@ func (m *MachineScope) resolveAvailabilityAndFaultDomain() (string, string, bool } availabilityDomain := fdEntry.Attributes[AvailabilityDomain] faultDomain := fdEntry.Attributes[FaultDomain] - originalFaultDomain := faultDomain - - // If fault domain is not specified, pick a random one from the availability domain - // if faultDomain == "" { - // fds := m.faultDomainsForAvailabilityDomain(availabilityDomain) - // if len(fds) == 0 { - // err := errors.New("no fault domains discovered for availability domain") - // m.Logger.Error(err, "Fault domain discovery failed", "availability-domain", availabilityDomain) - // return "", "", false, err - // } - - // randomIndex, err := rand.Int(rand.Reader, big.NewInt(int64(len(fds)))) - // if err != nil { - // m.Logger.Error(err, "Failed to generate random fallback fault domain") - // return "", "", false, err - // } - // faultDomain = fds[randomIndex.Int64()] - // } - - m.Logger.Info("Failure Domain being used", "failure-domain", *failureDomainKey) - return availabilityDomain, faultDomain, originalFaultDomain != "", nil + + return availabilityDomain, faultDomain, faultDomain != "", nil } func (m *MachineScope) getFreeFormTags() map[string]string { diff --git a/cloud/scope/machine_test.go b/cloud/scope/machine_test.go index 9b637d8e6..8f26f36c0 100644 --- a/cloud/scope/machine_test.go +++ b/cloud/scope/machine_test.go @@ -321,7 +321,7 @@ func TestInstanceReconciliation(t *testing.T) { }, }, { - name: "retries alternate fault domains on capacity errors", + name: "retries other fault domains when out of capacity error is returned and initial fault domain is set", errorExpected: false, testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { setupAllParams(ms) @@ -362,46 +362,7 @@ func TestInstanceReconciliation(t *testing.T) { }, }, { - name: "returns error after exhausting all fault domains", - errorExpected: true, - errorSubStringMatch: true, - matchError: errors.New("out of host capacity"), - testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { - setupAllParams(ms) - ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster - ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ - "1": { - Attributes: map[string]string{ - "AvailabilityDomain": "ad1", - "FaultDomain": "FAULT-DOMAIN-1", - }, - }, - "2": { - Attributes: map[string]string{ - "AvailabilityDomain": "ad1", - "FaultDomain": "FAULT-DOMAIN-2", - }, - }, - } - ms.Machine.Spec.FailureDomain = common.String("1") - computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ - DisplayName: common.String("name"), - CompartmentId: common.String("test"), - })).Return(core.ListInstancesResponse{}, nil) - - first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { - return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") - })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) - - second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { - return faultDomainRequestMatcher(request, "FAULT-DOMAIN-2") - })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity in fd2")) - - gomock.InOrder(first, second) - }, - }, - { - name: "does not retry when fault domain is derived from availability domain cache", + name: "does not retry when initial fault domain is empty", errorExpected: true, errorSubStringMatch: true, matchError: errors.New("out of host capacity"), @@ -418,7 +379,7 @@ func TestInstanceReconciliation(t *testing.T) { ociCluster.Spec.AvailabilityDomains = map[string]infrastructurev1beta2.OCIAvailabilityDomain{ "ad1": { Name: "ad1", - FaultDomains: []string{"FAULT-DOMAIN-1"}, + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2"}, }, } ms.Machine.Spec.FailureDomain = common.String("1") @@ -428,7 +389,7 @@ func TestInstanceReconciliation(t *testing.T) { })).Return(core.ListInstancesResponse{}, nil) computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { - return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") + return faultDomainRequestMatcher(request, "") })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) }, }, From 2be43101d570b8609a092cbe75f26da4504fe677 Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Thu, 11 Dec 2025 00:01:32 -0600 Subject: [PATCH 07/13] use different token when retrying --- cloud/scope/machine.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index c0faf602c..468c5282f 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -273,17 +273,27 @@ func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, b if len(faultDomains) == 0 { faultDomains = []string{ociutil.DerefString(baseDetails.FaultDomain)} } - opcRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID)) + baseRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID)) var lastErr error for idx, fd := range faultDomains { details := baseDetails if fd != "" { details.FaultDomain = common.String(fd) } + + // Derive a fault-domain-specific retry token to avoid + // requests being rejected as duplicates. + tokenVal := ociutil.DerefString(baseRetryToken) + if fd != "" { + tokenVal = tokenVal + "-" + fd + } + retryToken := common.String(tokenVal) + resp, err := m.ComputeClient.LaunchInstance(ctx, core.LaunchInstanceRequest{ LaunchInstanceDetails: details, - OpcRetryToken: opcRetryToken, + OpcRetryToken: retryToken, }) + if err == nil { return &resp.Instance, nil } From 61acc5fe3922644a47ae73f6c10f6a1927db6c1b Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Thu, 11 Dec 2025 00:58:52 -0600 Subject: [PATCH 08/13] add more test cases --- cloud/scope/machine_test.go | 117 +++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 3 deletions(-) diff --git a/cloud/scope/machine_test.go b/cloud/scope/machine_test.go index 8f26f36c0..932e03b64 100644 --- a/cloud/scope/machine_test.go +++ b/cloud/scope/machine_test.go @@ -347,11 +347,11 @@ func TestInstanceReconciliation(t *testing.T) { })).Return(core.ListInstancesResponse{}, nil) first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { - return faultDomainRequestMatcher(request, "FAULT-DOMAIN-1") + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-1") })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { - return faultDomainRequestMatcher(request, "FAULT-DOMAIN-2") + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-2") })).Return(core.LaunchInstanceResponse{ Instance: core.Instance{ Id: common.String("instance-id"), @@ -361,6 +361,99 @@ func TestInstanceReconciliation(t *testing.T) { gomock.InOrder(first, second) }, }, + { + name: "returns error after exhausting all fault domains", + errorExpected: true, + errorSubStringMatch: true, + matchError: errors.New("out of host capacity in fd3"), + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-1", + }, + }, + "2": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-2", + }, + }, + "3": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-3", + }, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity in fd1")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity in fd2")) + + third := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-3") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity in fd3")) + + gomock.InOrder(first, second, third) + }, + }, + { + name: "stops retrying when non capacity error occurs", + errorExpected: true, + matchError: errors.New("quota exceeded"), + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-1", + }, + }, + "2": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-2", + }, + }, + "3": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-3", + }, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{}, errors.New("quota exceeded")) + + gomock.InOrder(first, second) + }, + }, { name: "does not retry when initial fault domain is empty", errorExpected: true, @@ -389,7 +482,7 @@ func TestInstanceReconciliation(t *testing.T) { })).Return(core.ListInstancesResponse{}, nil) computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { - return faultDomainRequestMatcher(request, "") + return faultDomainRequestWithRetryTokenMatcher(request, "") })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) }, }, @@ -1580,6 +1673,24 @@ func faultDomainRequestMatcher(request interface{}, matchStr string) error { return nil } +func faultDomainRequestWithRetryTokenMatcher(request interface{}, matchStr string) error { + if err := faultDomainRequestMatcher(request, matchStr); err != nil { + return err + } + r := request.(core.LaunchInstanceRequest) + expectedToken := "machineuid" + if matchStr != "" { + expectedToken = expectedToken + "-" + matchStr + } + if r.OpcRetryToken == nil { + return errors.New("opc retry token was nil") + } + if *r.OpcRetryToken != expectedToken { + return errors.New(fmt.Sprintf("expecting opc retry token %s but got %s", expectedToken, *r.OpcRetryToken)) + } + return nil +} + func platformConfigMatcher(actual interface{}, expected core.PlatformConfig) error { r, ok := actual.(core.LaunchInstanceRequest) if !ok { From ec48b129157906b96e9157f8c7ad59c298503432 Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Thu, 11 Dec 2025 00:59:48 -0600 Subject: [PATCH 09/13] rename function --- cloud/scope/machine.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 468c5282f..63b466f0d 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -265,7 +265,7 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, launchDetails.PlatformConfig = m.getPlatformConfig() launchDetails.LaunchVolumeAttachments = m.getLaunchVolumeAttachments() // Build the list of fault domains to attempt launch in - faultDomains := m.buildFaultDomainLaunchList(availabilityDomain, faultDomain, retry) + faultDomains := m.buildFaultDomainRetryList(availabilityDomain, faultDomain, retry) return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains) } @@ -307,7 +307,7 @@ func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, b return nil, lastErr } -func (m *MachineScope) buildFaultDomainLaunchList(availabilityDomain, initialFaultDomain string, retry bool) []string { +func (m *MachineScope) buildFaultDomainRetryList(availabilityDomain, initialFaultDomain string, retry bool) []string { attempts := []string{initialFaultDomain} if !retry || availabilityDomain == "" { return attempts From 0660d8a086f276440976403400a144266624e202 Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Thu, 11 Dec 2025 09:59:56 -0600 Subject: [PATCH 10/13] add logger fro retry --- cloud/scope/machine.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 63b466f0d..7878bc349 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -269,12 +269,17 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains) } +// launchInstanceWithFaultDomainRetry iterates through the provided fault domains and +// attempts to launch an instance in each until one succeeds, all fail, or a non-capacity error occurs. func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, baseDetails core.LaunchInstanceDetails, faultDomains []string) (*core.Instance, error) { if len(faultDomains) == 0 { faultDomains = []string{ociutil.DerefString(baseDetails.FaultDomain)} } + baseRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID)) var lastErr error + totalAttempts := len(faultDomains) + for idx, fd := range faultDomains { details := baseDetails if fd != "" { @@ -289,6 +294,7 @@ func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, b } retryToken := common.String(tokenVal) + m.Logger.Info("Attempting instance launch", "faultDomain", fd, "attemptNumber", idx+1, "totalAttempts", totalAttempts) resp, err := m.ComputeClient.LaunchInstance(ctx, core.LaunchInstanceRequest{ LaunchInstanceDetails: details, OpcRetryToken: retryToken, @@ -302,11 +308,17 @@ func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, b return nil, err } lastErr = err - m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "faultDomain", fd) + nextFD := "" + if !lastAttempt { + nextFD = faultDomains[idx+1] + } + m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "nextFaultDomain", nextFD) } return nil, lastErr } +// buildFaultDomainRetryList returns the list of fault domains we should attempt +// when launching in the given availability domain. func (m *MachineScope) buildFaultDomainRetryList(availabilityDomain, initialFaultDomain string, retry bool) []string { attempts := []string{initialFaultDomain} if !retry || availabilityDomain == "" { @@ -323,6 +335,7 @@ func (m *MachineScope) buildFaultDomainRetryList(availabilityDomain, initialFaul return attempts } +// faultDomainsForAvailabilityDomain returns the fault domains for the availability domain. func (m *MachineScope) faultDomainsForAvailabilityDomain(availabilityDomain string) []string { if adMap := m.OCIClusterAccessor.GetAvailabilityDomains(); adMap != nil { if adEntry, ok := adMap[availabilityDomain]; ok { From e04ffb6d3942b83f47fe19118cf072d0c68a8fcf Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Thu, 11 Dec 2025 14:07:27 -0600 Subject: [PATCH 11/13] refine --- cloud/scope/machine.go | 46 ++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 7878bc349..a6b0354bc 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -276,6 +276,10 @@ func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, b faultDomains = []string{ociutil.DerefString(baseDetails.FaultDomain)} } + if m.OCIMachine == nil { + return nil, errors.New("machine scope is missing OCIMachine") + } + baseRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID)) var lastErr error totalAttempts := len(faultDomains) @@ -317,25 +321,25 @@ func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, b return nil, lastErr } -// buildFaultDomainRetryList returns the list of fault domains we should attempt +// buildFaultDomainRetryList returns the list of fault domains we should try // when launching in the given availability domain. func (m *MachineScope) buildFaultDomainRetryList(availabilityDomain, initialFaultDomain string, retry bool) []string { - attempts := []string{initialFaultDomain} + faultDomainList := []string{initialFaultDomain} if !retry || availabilityDomain == "" { - return attempts + return faultDomainList } for _, fd := range m.faultDomainsForAvailabilityDomain(availabilityDomain) { if fd == initialFaultDomain { continue } - attempts = append(attempts, fd) + faultDomainList = append(faultDomainList, fd) } - return attempts + return faultDomainList } -// faultDomainsForAvailabilityDomain returns the fault domains for the availability domain. +// faultDomainsForAvailabilityDomain returns the fault domains for the given availability domain. func (m *MachineScope) faultDomainsForAvailabilityDomain(availabilityDomain string) []string { if adMap := m.OCIClusterAccessor.GetAvailabilityDomains(); adMap != nil { if adEntry, ok := adMap[availabilityDomain]; ok { @@ -346,32 +350,34 @@ func (m *MachineScope) faultDomainsForAvailabilityDomain(availabilityDomain stri } func (m *MachineScope) resolveAvailabilityAndFaultDomain() (string, string, bool, error) { - failureDomainKey := m.Machine.Spec.FailureDomain - if failureDomainKey == nil { + faultDomainKey := m.Machine.Spec.FailureDomain + + // CAPI refers to these IDs as FailureDomains but in OCI they map to FaultDomains. + if faultDomainKey == nil { randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3)) if err != nil { - m.Logger.Error(err, "Failed to generate random failure domain") + m.Logger.Error(err, "Failed to generate random fault domain") return "", "", false, err } - failureDomainKey = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) + faultDomainKey = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) } - failureDomainIndex, err := strconv.Atoi(*failureDomainKey) + faultDomainIndex, err := strconv.Atoi(*faultDomainKey) if err != nil { - m.Logger.Error(err, "Failure Domain is not a valid integer") - return "", "", false, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer") + m.Logger.Error(err, "Fault domain is not a valid integer") + return "", "", false, errors.Wrap(err, "invalid fault domain parameter, must be a valid integer") } - if failureDomainIndex < 1 || failureDomainIndex > 3 { - err = errors.New("failure domain should be a value between 1 and 3") - m.Logger.Error(err, "Failure domain should be a value between 1 and 3") + if faultDomainIndex < 1 || faultDomainIndex > 3 { + err = errors.New("fault domain should be a value between 1 and 3") + m.Logger.Error(err, "Fault domain should be a value between 1 and 3") return "", "", false, err } - m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex) + m.Logger.Info("Fault Domain being used", "fault-domain", faultDomainIndex) - fdEntry, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomainKey] + fdEntry, exists := m.OCIClusterAccessor.GetFailureDomains()[*faultDomainKey] if !exists { - err := errors.New("failure domain not found in cluster failure domains") - m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomainKey) + err := errors.New("fault domain not found in cluster failure domains") + m.Logger.Error(err, "Fault domain not found", "fault-domain", *faultDomainKey) return "", "", false, err } availabilityDomain := fdEntry.Attributes[AvailabilityDomain] From b839a8729b7bf7229836df0c87cb1d0e95e19580 Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Fri, 12 Dec 2025 14:21:59 -0600 Subject: [PATCH 12/13] check capacity error with error code --- cloud/ociutil/ociutil.go | 9 ++++++-- cloud/ociutil/ociutil_test.go | 41 ++++++++++++++++++++++++++++++----- cloud/scope/machine.go | 30 ++++++++++++------------- cloud/scope/machine_test.go | 33 +++++++++++++++++++++++++++- 4 files changed, 89 insertions(+), 24 deletions(-) diff --git a/cloud/ociutil/ociutil.go b/cloud/ociutil/ociutil.go index 2f7dc01c2..fe9c59c06 100644 --- a/cloud/ociutil/ociutil.go +++ b/cloud/ociutil/ociutil.go @@ -174,12 +174,17 @@ func BuildClusterTags(ClusterResourceUID string) map[string]string { return tags } -// IsOutOfHostCapacity returns true when the error message indicates that the fault domain ran out of capacity. +// IsOutOfHostCapacity returns true when the OCI service error indicates that the fault domain ran out of capacity. func IsOutOfHostCapacity(err error) bool { if err == nil { return false } - return strings.Contains(strings.ToLower(err.Error()), strings.ToLower(OutOfHostCapacityErr)) + err = errors.Cause(err) + if serviceErr, ok := common.IsServiceError(err); ok { + return serviceErr.GetHTTPStatusCode() == http.StatusInternalServerError && + strings.Contains(strings.ToLower(serviceErr.GetMessage()), strings.ToLower(OutOfHostCapacityErr)) + } + return false } // DerefString returns the string value if the pointer isn't nil, otherwise returns empty string diff --git a/cloud/ociutil/ociutil_test.go b/cloud/ociutil/ociutil_test.go index 7db2a0a52..eb97cf8f4 100644 --- a/cloud/ociutil/ociutil_test.go +++ b/cloud/ociutil/ociutil_test.go @@ -18,6 +18,7 @@ package ociutil import ( "fmt" + "net/http" "reflect" "testing" @@ -91,14 +92,19 @@ func TestIsOutOfHostCapacity(t *testing.T) { expected bool }{ { - name: "matches exact message", - err: fmt.Errorf(OutOfHostCapacityErr), + name: "service error matches internal code and message", + err: fakeServiceError{status: http.StatusInternalServerError, message: OutOfHostCapacityErr}, expected: true, }, { - name: "matches substring", - err: fmt.Errorf("Instance launch failed due to %s in chosen fd", OutOfHostCapacityErr), - expected: true, + name: "non matching service error status", + err: fakeServiceError{status: http.StatusBadRequest, message: OutOfHostCapacityErr}, + expected: false, + }, + { + name: "non matching service error message", + err: fakeServiceError{status: http.StatusInternalServerError, message: "other"}, + expected: false, }, { name: "non matching message", @@ -121,3 +127,28 @@ func TestIsOutOfHostCapacity(t *testing.T) { }) } } + +type fakeServiceError struct { + status int + message string +} + +func (f fakeServiceError) Error() string { + return f.message +} + +func (f fakeServiceError) GetHTTPStatusCode() int { + return f.status +} + +func (f fakeServiceError) GetMessage() string { + return f.message +} + +func (f fakeServiceError) GetCode() string { + return "" +} + +func (f fakeServiceError) GetOpcRequestID() string { + return "" +} diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index a6b0354bc..fcd65e08d 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -350,34 +350,32 @@ func (m *MachineScope) faultDomainsForAvailabilityDomain(availabilityDomain stri } func (m *MachineScope) resolveAvailabilityAndFaultDomain() (string, string, bool, error) { - faultDomainKey := m.Machine.Spec.FailureDomain - - // CAPI refers to these IDs as FailureDomains but in OCI they map to FaultDomains. - if faultDomainKey == nil { + failureDomainKey := m.Machine.Spec.FailureDomain + if failureDomainKey == nil { randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3)) if err != nil { - m.Logger.Error(err, "Failed to generate random fault domain") + m.Logger.Error(err, "Failed to generate random failure domain") return "", "", false, err } - faultDomainKey = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) + failureDomainKey = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) } - faultDomainIndex, err := strconv.Atoi(*faultDomainKey) + failureDomainIndex, err := strconv.Atoi(*failureDomainKey) if err != nil { - m.Logger.Error(err, "Fault domain is not a valid integer") - return "", "", false, errors.Wrap(err, "invalid fault domain parameter, must be a valid integer") + m.Logger.Error(err, "Failure Domain is not a valid integer") + return "", "", false, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer") } - if faultDomainIndex < 1 || faultDomainIndex > 3 { - err = errors.New("fault domain should be a value between 1 and 3") - m.Logger.Error(err, "Fault domain should be a value between 1 and 3") + if failureDomainIndex < 1 || failureDomainIndex > 3 { + err = errors.New("failure domain should be a value between 1 and 3") + m.Logger.Error(err, "Failure domain should be a value between 1 and 3") return "", "", false, err } - m.Logger.Info("Fault Domain being used", "fault-domain", faultDomainIndex) + m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex) - fdEntry, exists := m.OCIClusterAccessor.GetFailureDomains()[*faultDomainKey] + fdEntry, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomainKey] if !exists { - err := errors.New("fault domain not found in cluster failure domains") - m.Logger.Error(err, "Fault domain not found", "fault-domain", *faultDomainKey) + err := errors.New("failure domain not found in cluster failure domains") + m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomainKey) return "", "", false, err } availabilityDomain := fdEntry.Attributes[AvailabilityDomain] diff --git a/cloud/scope/machine_test.go b/cloud/scope/machine_test.go index 932e03b64..fc208d63e 100644 --- a/cloud/scope/machine_test.go +++ b/cloud/scope/machine_test.go @@ -20,6 +20,7 @@ import ( "context" "encoding/base64" "fmt" + "net/http" "reflect" "testing" @@ -3144,6 +3145,36 @@ func setupAllParams(ms *MachineScope) { ms.OCIMachine.UID = "machineuid" } +type testServiceError struct { + status int + code string + message string +} + +func (t testServiceError) Error() string { + return t.message +} + +func (t testServiceError) GetHTTPStatusCode() int { + return t.status +} + +func (t testServiceError) GetCode() string { + return t.code +} + +func (t testServiceError) GetMessage() string { + return t.message +} + +func (t testServiceError) GetOpcRequestID() string { + return "" +} + func newOutOfCapacityServiceError(message string) error { - return fmt.Errorf("%s: %s", ociutil.OutOfHostCapacityErr, message) + return testServiceError{ + status: http.StatusInternalServerError, + code: "InternalError", + message: fmt.Sprintf("%s: %s", ociutil.OutOfHostCapacityErr, message), + } } From 8d1d6afc3898809a3b79fd4b7b444fc57ee86beb Mon Sep 17 00:00:00 2001 From: Ethan Liu Date: Fri, 12 Dec 2025 17:58:16 -0600 Subject: [PATCH 13/13] add error reference --- cloud/ociutil/ociutil.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cloud/ociutil/ociutil.go b/cloud/ociutil/ociutil.go index fe9c59c06..0f12efcc6 100644 --- a/cloud/ociutil/ociutil.go +++ b/cloud/ociutil/ociutil.go @@ -175,6 +175,7 @@ func BuildClusterTags(ClusterResourceUID string) map[string]string { } // IsOutOfHostCapacity returns true when the OCI service error indicates that the fault domain ran out of capacity. +// Error code: https://docs.public.content.oci.oraclecloud.com/en-us/iaas/Content/Compute/known-issues.htm?utm_source=chatgpt.com#out-of-host-capacity-error-when-creating-compute-instances:~:text=Tag%20Defaults.-,Out%20of%20host%20capacity%20error%20when%20creating%20compute%20instances,-%F0%9F%94%97 func IsOutOfHostCapacity(err error) bool { if err == nil { return false