Skip to content
Open
16 changes: 16 additions & 0 deletions cloud/ociutil/ociutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"net/http"
"strings"
"time"

lb "github.com/oracle/cluster-api-provider-oci/cloud/services/loadbalancer"
Expand All @@ -41,6 +42,7 @@ const (
CreatedBy = "CreatedBy"
OCIClusterAPIProvider = "OCIClusterAPIProvider"
ClusterResourceIdentifier = "ClusterResourceIdentifier"
OutOfHostCapacityErr = "Out of host capacity"
)

// ErrNotFound is for simulation during testing, OCI SDK does not have a way
Expand Down Expand Up @@ -172,6 +174,20 @@ func BuildClusterTags(ClusterResourceUID string) map[string]string {
return tags
}

// IsOutOfHostCapacity returns true when the OCI service error indicates that the fault domain ran out of capacity.
// Error code: https://docs.public.content.oci.oraclecloud.com/en-us/iaas/Content/Compute/known-issues.htm?utm_source=chatgpt.com#out-of-host-capacity-error-when-creating-compute-instances:~:text=Tag%20Defaults.-,Out%20of%20host%20capacity%20error%20when%20creating%20compute%20instances,-%F0%9F%94%97
func IsOutOfHostCapacity(err error) bool {
if err == nil {
return false
}
err = errors.Cause(err)
if serviceErr, ok := common.IsServiceError(err); ok {
return serviceErr.GetHTTPStatusCode() == http.StatusInternalServerError &&
strings.Contains(strings.ToLower(serviceErr.GetMessage()), strings.ToLower(OutOfHostCapacityErr))
}
return false
}

// DerefString returns the string value if the pointer isn't nil, otherwise returns empty string
func DerefString(s *string) string {
if s != nil {
Expand Down
69 changes: 69 additions & 0 deletions cloud/ociutil/ociutil_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package ociutil

import (
"fmt"
"net/http"
"reflect"
"testing"

Expand Down Expand Up @@ -83,3 +84,71 @@ func TestAddToDefaultClusterTags(t *testing.T) {
}
}
}

func TestIsOutOfHostCapacity(t *testing.T) {
testCases := []struct {
name string
err error
expected bool
}{
{
name: "service error matches internal code and message",
err: fakeServiceError{status: http.StatusInternalServerError, message: OutOfHostCapacityErr},
expected: true,
},
{
name: "non matching service error status",
err: fakeServiceError{status: http.StatusBadRequest, message: OutOfHostCapacityErr},
expected: false,
},
{
name: "non matching service error message",
err: fakeServiceError{status: http.StatusInternalServerError, message: "other"},
expected: false,
},
{
name: "non matching message",
err: fmt.Errorf("boom"),
expected: false,
},
{
name: "nil error",
err: nil,
expected: false,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
actual := IsOutOfHostCapacity(tc.err)
if actual != tc.expected {
t.Fatalf("expected %t but got %t for test %s", tc.expected, actual, tc.name)
}
})
}
}

type fakeServiceError struct {
status int
message string
}

func (f fakeServiceError) Error() string {
return f.message
}

func (f fakeServiceError) GetHTTPStatusCode() int {
return f.status
}

func (f fakeServiceError) GetMessage() string {
return f.message
}

func (f fakeServiceError) GetCode() string {
return ""
}

func (f fakeServiceError) GetOpcRequestID() string {
return ""
}
163 changes: 119 additions & 44 deletions cloud/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,44 +206,9 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance,
}
}

failureDomain := m.Machine.Spec.FailureDomain
var faultDomain string
var availabilityDomain string
if failureDomain != nil {
failureDomainIndex, err := strconv.Atoi(*failureDomain)
if err != nil {
m.Logger.Error(err, "Failure Domain is not a valid integer")
return nil, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer")
}
m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex)
if failureDomainIndex < 1 || failureDomainIndex > 3 {
err = errors.New("failure domain should be a value between 1 and 3")
m.Logger.Error(err, "Failure domain should be a value between 1 and 3")
return nil, err
}
fd, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomain]
if !exists {
err = errors.New("failure domain not found in cluster failure domains")
m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomain)
return nil, err
}
faultDomain = fd.Attributes[FaultDomain]
availabilityDomain = fd.Attributes[AvailabilityDomain]
} else {
randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3))
if err != nil {
m.Logger.Error(err, "Failed to generate random failure domain")
return nil, err
}
// the random number generated is between zero and two, whereas we need a number between one and three
failureDomain = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1))
fd, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomain]
if !exists {
err = errors.New("failure domain not found in cluster failure domains")
m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomain)
return nil, err
}
availabilityDomain = fd.Attributes[AvailabilityDomain]
availabilityDomain, faultDomain, retry, err := m.resolveAvailabilityAndFaultDomain()
if err != nil {
return nil, err
}

metadata := m.OCIMachine.Spec.Metadata
Expand Down Expand Up @@ -299,14 +264,124 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance,
launchDetails.PreemptibleInstanceConfig = m.getPreemptibleInstanceConfig()
launchDetails.PlatformConfig = m.getPlatformConfig()
launchDetails.LaunchVolumeAttachments = m.getLaunchVolumeAttachments()
req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails,
OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))}
resp, err := m.ComputeClient.LaunchInstance(ctx, req)
// Build the list of fault domains to attempt launch in
faultDomains := m.buildFaultDomainRetryList(availabilityDomain, faultDomain, retry)
return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains)
}

// launchInstanceWithFaultDomainRetry iterates through the provided fault domains and
// attempts to launch an instance in each until one succeeds, all fail, or a non-capacity error occurs.
func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, baseDetails core.LaunchInstanceDetails, faultDomains []string) (*core.Instance, error) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lets add GoDoc style comments to the functions.

if len(faultDomains) == 0 {
faultDomains = []string{ociutil.DerefString(baseDetails.FaultDomain)}
}

if m.OCIMachine == nil {
return nil, errors.New("machine scope is missing OCIMachine")
}

baseRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))
var lastErr error
totalAttempts := len(faultDomains)

for idx, fd := range faultDomains {
details := baseDetails
if fd != "" {
details.FaultDomain = common.String(fd)
}

// Derive a fault-domain-specific retry token to avoid
// requests being rejected as duplicates.
tokenVal := ociutil.DerefString(baseRetryToken)
if fd != "" {
tokenVal = tokenVal + "-" + fd
}
retryToken := common.String(tokenVal)

m.Logger.Info("Attempting instance launch", "faultDomain", fd, "attemptNumber", idx+1, "totalAttempts", totalAttempts)
resp, err := m.ComputeClient.LaunchInstance(ctx, core.LaunchInstanceRequest{
LaunchInstanceDetails: details,
OpcRetryToken: retryToken,
})

if err == nil {
return &resp.Instance, nil
}
lastAttempt := idx == len(faultDomains)-1
if !ociutil.IsOutOfHostCapacity(err) || lastAttempt {
return nil, err
}
lastErr = err
nextFD := ""
if !lastAttempt {
nextFD = faultDomains[idx+1]
}
m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "nextFaultDomain", nextFD)
}
return nil, lastErr
}

// buildFaultDomainRetryList returns the list of fault domains we should try
// when launching in the given availability domain.
func (m *MachineScope) buildFaultDomainRetryList(availabilityDomain, initialFaultDomain string, retry bool) []string {
faultDomainList := []string{initialFaultDomain}
if !retry || availabilityDomain == "" {
return faultDomainList
}

for _, fd := range m.faultDomainsForAvailabilityDomain(availabilityDomain) {
if fd == initialFaultDomain {
continue
}
faultDomainList = append(faultDomainList, fd)
}

return faultDomainList
}

// faultDomainsForAvailabilityDomain returns the fault domains for the given availability domain.
func (m *MachineScope) faultDomainsForAvailabilityDomain(availabilityDomain string) []string {
if adMap := m.OCIClusterAccessor.GetAvailabilityDomains(); adMap != nil {
if adEntry, ok := adMap[availabilityDomain]; ok {
return append([]string(nil), adEntry.FaultDomains...)
}
}
return nil
}

func (m *MachineScope) resolveAvailabilityAndFaultDomain() (string, string, bool, error) {
failureDomainKey := m.Machine.Spec.FailureDomain
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lets stick with calling them FaultDomains here since that is what the function and all the new code calls it. Maybe even make a comment here that CAPI calls them Failure Domains, but OCI calls them FaultDomains

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could keep using Failure Domain as it contains AD/FD info. In case of single AD regions, the failure domain will be fault domain, in case of multi Ad regions, it will be AD. So we just get what we need from Failure domain. Not sure if this make sense.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is fine then lets rename resolveAvailabilityAndFaultDomain to stick with FailureDomain. I just want to be as clear as we can be.

if failureDomainKey == nil {
randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3))
if err != nil {
m.Logger.Error(err, "Failed to generate random failure domain")
return "", "", false, err
}
failureDomainKey = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1))
}

failureDomainIndex, err := strconv.Atoi(*failureDomainKey)
if err != nil {
return nil, err
} else {
return &resp.Instance, nil
m.Logger.Error(err, "Failure Domain is not a valid integer")
return "", "", false, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer")
}
if failureDomainIndex < 1 || failureDomainIndex > 3 {
err = errors.New("failure domain should be a value between 1 and 3")
m.Logger.Error(err, "Failure domain should be a value between 1 and 3")
return "", "", false, err
}
m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex)

fdEntry, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomainKey]
if !exists {
err := errors.New("failure domain not found in cluster failure domains")
m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomainKey)
return "", "", false, err
}
availabilityDomain := fdEntry.Attributes[AvailabilityDomain]
faultDomain := fdEntry.Attributes[FaultDomain]

return availabilityDomain, faultDomain, faultDomain != "", nil
}

func (m *MachineScope) getFreeFormTags() map[string]string {
Expand Down
Loading