diff --git a/cloud/ociutil/ociutil.go b/cloud/ociutil/ociutil.go index 93976190..0f12efcc 100644 --- a/cloud/ociutil/ociutil.go +++ b/cloud/ociutil/ociutil.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "net/http" + "strings" "time" lb "github.com/oracle/cluster-api-provider-oci/cloud/services/loadbalancer" @@ -41,6 +42,7 @@ const ( CreatedBy = "CreatedBy" OCIClusterAPIProvider = "OCIClusterAPIProvider" ClusterResourceIdentifier = "ClusterResourceIdentifier" + OutOfHostCapacityErr = "Out of host capacity" ) // ErrNotFound is for simulation during testing, OCI SDK does not have a way @@ -172,6 +174,20 @@ func BuildClusterTags(ClusterResourceUID string) map[string]string { return tags } +// IsOutOfHostCapacity returns true when the OCI service error indicates that the fault domain ran out of capacity. +// Error code: https://docs.public.content.oci.oraclecloud.com/en-us/iaas/Content/Compute/known-issues.htm?utm_source=chatgpt.com#out-of-host-capacity-error-when-creating-compute-instances:~:text=Tag%20Defaults.-,Out%20of%20host%20capacity%20error%20when%20creating%20compute%20instances,-%F0%9F%94%97 +func IsOutOfHostCapacity(err error) bool { + if err == nil { + return false + } + err = errors.Cause(err) + if serviceErr, ok := common.IsServiceError(err); ok { + return serviceErr.GetHTTPStatusCode() == http.StatusInternalServerError && + strings.Contains(strings.ToLower(serviceErr.GetMessage()), strings.ToLower(OutOfHostCapacityErr)) + } + return false +} + // DerefString returns the string value if the pointer isn't nil, otherwise returns empty string func DerefString(s *string) string { if s != nil { diff --git a/cloud/ociutil/ociutil_test.go b/cloud/ociutil/ociutil_test.go index 37449c0b..eb97cf8f 100644 --- a/cloud/ociutil/ociutil_test.go +++ b/cloud/ociutil/ociutil_test.go @@ -18,6 +18,7 @@ package ociutil import ( "fmt" + "net/http" "reflect" "testing" @@ -83,3 +84,71 @@ func TestAddToDefaultClusterTags(t *testing.T) { } } } + +func TestIsOutOfHostCapacity(t *testing.T) { + testCases := []struct { + name string + err error + expected bool + }{ + { + name: "service error matches internal code and message", + err: fakeServiceError{status: http.StatusInternalServerError, message: OutOfHostCapacityErr}, + expected: true, + }, + { + name: "non matching service error status", + err: fakeServiceError{status: http.StatusBadRequest, message: OutOfHostCapacityErr}, + expected: false, + }, + { + name: "non matching service error message", + err: fakeServiceError{status: http.StatusInternalServerError, message: "other"}, + expected: false, + }, + { + name: "non matching message", + err: fmt.Errorf("boom"), + expected: false, + }, + { + name: "nil error", + err: nil, + expected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + actual := IsOutOfHostCapacity(tc.err) + if actual != tc.expected { + t.Fatalf("expected %t but got %t for test %s", tc.expected, actual, tc.name) + } + }) + } +} + +type fakeServiceError struct { + status int + message string +} + +func (f fakeServiceError) Error() string { + return f.message +} + +func (f fakeServiceError) GetHTTPStatusCode() int { + return f.status +} + +func (f fakeServiceError) GetMessage() string { + return f.message +} + +func (f fakeServiceError) GetCode() string { + return "" +} + +func (f fakeServiceError) GetOpcRequestID() string { + return "" +} diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 37bb7e16..fcd65e08 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -206,44 +206,9 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, } } - failureDomain := m.Machine.Spec.FailureDomain - var faultDomain string - var availabilityDomain string - if failureDomain != nil { - failureDomainIndex, err := strconv.Atoi(*failureDomain) - if err != nil { - m.Logger.Error(err, "Failure Domain is not a valid integer") - return nil, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer") - } - m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex) - if failureDomainIndex < 1 || failureDomainIndex > 3 { - err = errors.New("failure domain should be a value between 1 and 3") - m.Logger.Error(err, "Failure domain should be a value between 1 and 3") - return nil, err - } - fd, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomain] - if !exists { - err = errors.New("failure domain not found in cluster failure domains") - m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomain) - return nil, err - } - faultDomain = fd.Attributes[FaultDomain] - availabilityDomain = fd.Attributes[AvailabilityDomain] - } else { - randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3)) - if err != nil { - m.Logger.Error(err, "Failed to generate random failure domain") - return nil, err - } - // the random number generated is between zero and two, whereas we need a number between one and three - failureDomain = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) - fd, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomain] - if !exists { - err = errors.New("failure domain not found in cluster failure domains") - m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomain) - return nil, err - } - availabilityDomain = fd.Attributes[AvailabilityDomain] + availabilityDomain, faultDomain, retry, err := m.resolveAvailabilityAndFaultDomain() + if err != nil { + return nil, err } metadata := m.OCIMachine.Spec.Metadata @@ -299,14 +264,124 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, launchDetails.PreemptibleInstanceConfig = m.getPreemptibleInstanceConfig() launchDetails.PlatformConfig = m.getPlatformConfig() launchDetails.LaunchVolumeAttachments = m.getLaunchVolumeAttachments() - req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails, - OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))} - resp, err := m.ComputeClient.LaunchInstance(ctx, req) + // Build the list of fault domains to attempt launch in + faultDomains := m.buildFaultDomainRetryList(availabilityDomain, faultDomain, retry) + return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains) +} + +// launchInstanceWithFaultDomainRetry iterates through the provided fault domains and +// attempts to launch an instance in each until one succeeds, all fail, or a non-capacity error occurs. +func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, baseDetails core.LaunchInstanceDetails, faultDomains []string) (*core.Instance, error) { + if len(faultDomains) == 0 { + faultDomains = []string{ociutil.DerefString(baseDetails.FaultDomain)} + } + + if m.OCIMachine == nil { + return nil, errors.New("machine scope is missing OCIMachine") + } + + baseRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID)) + var lastErr error + totalAttempts := len(faultDomains) + + for idx, fd := range faultDomains { + details := baseDetails + if fd != "" { + details.FaultDomain = common.String(fd) + } + + // Derive a fault-domain-specific retry token to avoid + // requests being rejected as duplicates. + tokenVal := ociutil.DerefString(baseRetryToken) + if fd != "" { + tokenVal = tokenVal + "-" + fd + } + retryToken := common.String(tokenVal) + + m.Logger.Info("Attempting instance launch", "faultDomain", fd, "attemptNumber", idx+1, "totalAttempts", totalAttempts) + resp, err := m.ComputeClient.LaunchInstance(ctx, core.LaunchInstanceRequest{ + LaunchInstanceDetails: details, + OpcRetryToken: retryToken, + }) + + if err == nil { + return &resp.Instance, nil + } + lastAttempt := idx == len(faultDomains)-1 + if !ociutil.IsOutOfHostCapacity(err) || lastAttempt { + return nil, err + } + lastErr = err + nextFD := "" + if !lastAttempt { + nextFD = faultDomains[idx+1] + } + m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "nextFaultDomain", nextFD) + } + return nil, lastErr +} + +// buildFaultDomainRetryList returns the list of fault domains we should try +// when launching in the given availability domain. +func (m *MachineScope) buildFaultDomainRetryList(availabilityDomain, initialFaultDomain string, retry bool) []string { + faultDomainList := []string{initialFaultDomain} + if !retry || availabilityDomain == "" { + return faultDomainList + } + + for _, fd := range m.faultDomainsForAvailabilityDomain(availabilityDomain) { + if fd == initialFaultDomain { + continue + } + faultDomainList = append(faultDomainList, fd) + } + + return faultDomainList +} + +// faultDomainsForAvailabilityDomain returns the fault domains for the given availability domain. +func (m *MachineScope) faultDomainsForAvailabilityDomain(availabilityDomain string) []string { + if adMap := m.OCIClusterAccessor.GetAvailabilityDomains(); adMap != nil { + if adEntry, ok := adMap[availabilityDomain]; ok { + return append([]string(nil), adEntry.FaultDomains...) + } + } + return nil +} + +func (m *MachineScope) resolveAvailabilityAndFaultDomain() (string, string, bool, error) { + failureDomainKey := m.Machine.Spec.FailureDomain + if failureDomainKey == nil { + randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3)) + if err != nil { + m.Logger.Error(err, "Failed to generate random failure domain") + return "", "", false, err + } + failureDomainKey = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) + } + + failureDomainIndex, err := strconv.Atoi(*failureDomainKey) if err != nil { - return nil, err - } else { - return &resp.Instance, nil + m.Logger.Error(err, "Failure Domain is not a valid integer") + return "", "", false, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer") + } + if failureDomainIndex < 1 || failureDomainIndex > 3 { + err = errors.New("failure domain should be a value between 1 and 3") + m.Logger.Error(err, "Failure domain should be a value between 1 and 3") + return "", "", false, err } + m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex) + + fdEntry, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomainKey] + if !exists { + err := errors.New("failure domain not found in cluster failure domains") + m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomainKey) + return "", "", false, err + } + availabilityDomain := fdEntry.Attributes[AvailabilityDomain] + faultDomain := fdEntry.Attributes[FaultDomain] + + return availabilityDomain, faultDomain, faultDomain != "", nil } func (m *MachineScope) getFreeFormTags() map[string]string { diff --git a/cloud/scope/machine_test.go b/cloud/scope/machine_test.go index c7086394..fc208d63 100644 --- a/cloud/scope/machine_test.go +++ b/cloud/scope/machine_test.go @@ -20,6 +20,7 @@ import ( "context" "encoding/base64" "fmt" + "net/http" "reflect" "testing" @@ -320,6 +321,172 @@ func TestInstanceReconciliation(t *testing.T) { })).Return(core.LaunchInstanceResponse{}, nil) }, }, + { + name: "retries other fault domains when out of capacity error is returned and initial fault domain is set", + errorExpected: false, + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-1", + }, + }, + "2": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-2", + }, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{ + Instance: core.Instance{ + Id: common.String("instance-id"), + }, + }, nil) + + gomock.InOrder(first, second) + }, + }, + { + name: "returns error after exhausting all fault domains", + errorExpected: true, + errorSubStringMatch: true, + matchError: errors.New("out of host capacity in fd3"), + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-1", + }, + }, + "2": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-2", + }, + }, + "3": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-3", + }, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity in fd1")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity in fd2")) + + third := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-3") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity in fd3")) + + gomock.InOrder(first, second, third) + }, + }, + { + name: "stops retrying when non capacity error occurs", + errorExpected: true, + matchError: errors.New("quota exceeded"), + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-1", + }, + }, + "2": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-2", + }, + }, + "3": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + "FaultDomain": "FAULT-DOMAIN-3", + }, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + first := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-1") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + + second := computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "FAULT-DOMAIN-2") + })).Return(core.LaunchInstanceResponse{}, errors.New("quota exceeded")) + + gomock.InOrder(first, second) + }, + }, + { + name: "does not retry when initial fault domain is empty", + errorExpected: true, + errorSubStringMatch: true, + matchError: errors.New("out of host capacity"), + testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) { + setupAllParams(ms) + ociCluster := ms.OCIClusterAccessor.(OCISelfManagedCluster).OCICluster + ociCluster.Status.FailureDomains = map[string]clusterv1.FailureDomainSpec{ + "1": { + Attributes: map[string]string{ + "AvailabilityDomain": "ad1", + }, + }, + } + ociCluster.Spec.AvailabilityDomains = map[string]infrastructurev1beta2.OCIAvailabilityDomain{ + "ad1": { + Name: "ad1", + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2"}, + }, + } + ms.Machine.Spec.FailureDomain = common.String("1") + computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{ + DisplayName: common.String("name"), + CompartmentId: common.String("test"), + })).Return(core.ListInstancesResponse{}, nil) + + computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error { + return faultDomainRequestWithRetryTokenMatcher(request, "") + })).Return(core.LaunchInstanceResponse{}, newOutOfCapacityServiceError("out of host capacity")) + }, + }, { name: "check all params together", errorExpected: false, @@ -1487,6 +1654,44 @@ func instanceCompartmentIDMatcher(request interface{}, matchStr string) error { return nil } +func faultDomainRequestMatcher(request interface{}, matchStr string) error { + r, ok := request.(core.LaunchInstanceRequest) + if !ok { + return errors.New("expecting LaunchInstanceRequest type") + } + if matchStr == "" { + if r.LaunchInstanceDetails.FaultDomain != nil { + return errors.New("expected fault domain to be nil") + } + return nil + } + if r.LaunchInstanceDetails.FaultDomain == nil { + return errors.New("fault domain was nil") + } + if *r.LaunchInstanceDetails.FaultDomain != matchStr { + return errors.New(fmt.Sprintf("expecting fault domain %s but got %s", matchStr, *r.LaunchInstanceDetails.FaultDomain)) + } + return nil +} + +func faultDomainRequestWithRetryTokenMatcher(request interface{}, matchStr string) error { + if err := faultDomainRequestMatcher(request, matchStr); err != nil { + return err + } + r := request.(core.LaunchInstanceRequest) + expectedToken := "machineuid" + if matchStr != "" { + expectedToken = expectedToken + "-" + matchStr + } + if r.OpcRetryToken == nil { + return errors.New("opc retry token was nil") + } + if *r.OpcRetryToken != expectedToken { + return errors.New(fmt.Sprintf("expecting opc retry token %s but got %s", expectedToken, *r.OpcRetryToken)) + } + return nil +} + func platformConfigMatcher(actual interface{}, expected core.PlatformConfig) error { r, ok := actual.(core.LaunchInstanceRequest) if !ok { @@ -2914,6 +3119,20 @@ func setupAllParams(ms *MachineScope) { }, }, } + ociCluster.Spec.AvailabilityDomains = map[string]infrastructurev1beta2.OCIAvailabilityDomain{ + "ad1": { + Name: "ad1", + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2", "FAULT-DOMAIN-3"}, + }, + "ad2": { + Name: "ad2", + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2", "FAULT-DOMAIN-3"}, + }, + "ad3": { + Name: "ad3", + FaultDomains: []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2", "FAULT-DOMAIN-3"}, + }, + } ms.Machine.Spec.FailureDomain = common.String("2") ociCluster.Spec.NetworkSpec.Vcn.Subnets = []*infrastructurev1beta2.Subnet{ { @@ -2925,3 +3144,37 @@ func setupAllParams(ms *MachineScope) { ociCluster.Spec.OCIResourceIdentifier = "resource_uid" ms.OCIMachine.UID = "machineuid" } + +type testServiceError struct { + status int + code string + message string +} + +func (t testServiceError) Error() string { + return t.message +} + +func (t testServiceError) GetHTTPStatusCode() int { + return t.status +} + +func (t testServiceError) GetCode() string { + return t.code +} + +func (t testServiceError) GetMessage() string { + return t.message +} + +func (t testServiceError) GetOpcRequestID() string { + return "" +} + +func newOutOfCapacityServiceError(message string) error { + return testServiceError{ + status: http.StatusInternalServerError, + code: "InternalError", + message: fmt.Sprintf("%s: %s", ociutil.OutOfHostCapacityErr, message), + } +}