-
Notifications
You must be signed in to change notification settings - Fork 40
feat: Add Fault Domain retry logic for out of host capacity error #473
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9d3eb62
266a757
821aa45
88316d1
a7aa01f
95c3a39
dfe0740
cf707a1
2be4310
61acc5f
ec48b12
0660d8a
e04ffb6
b839a87
8d1d6af
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -206,44 +206,9 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, | |
| } | ||
| } | ||
|
|
||
| failureDomain := m.Machine.Spec.FailureDomain | ||
| var faultDomain string | ||
| var availabilityDomain string | ||
| if failureDomain != nil { | ||
| failureDomainIndex, err := strconv.Atoi(*failureDomain) | ||
| if err != nil { | ||
| m.Logger.Error(err, "Failure Domain is not a valid integer") | ||
| return nil, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer") | ||
| } | ||
| m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex) | ||
| if failureDomainIndex < 1 || failureDomainIndex > 3 { | ||
| err = errors.New("failure domain should be a value between 1 and 3") | ||
| m.Logger.Error(err, "Failure domain should be a value between 1 and 3") | ||
| return nil, err | ||
| } | ||
| fd, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomain] | ||
| if !exists { | ||
| err = errors.New("failure domain not found in cluster failure domains") | ||
| m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomain) | ||
| return nil, err | ||
| } | ||
| faultDomain = fd.Attributes[FaultDomain] | ||
| availabilityDomain = fd.Attributes[AvailabilityDomain] | ||
| } else { | ||
| randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3)) | ||
| if err != nil { | ||
| m.Logger.Error(err, "Failed to generate random failure domain") | ||
| return nil, err | ||
| } | ||
| // the random number generated is between zero and two, whereas we need a number between one and three | ||
| failureDomain = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) | ||
| fd, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomain] | ||
| if !exists { | ||
| err = errors.New("failure domain not found in cluster failure domains") | ||
| m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomain) | ||
| return nil, err | ||
| } | ||
| availabilityDomain = fd.Attributes[AvailabilityDomain] | ||
| availabilityDomain, faultDomain, retry, err := m.resolveAvailabilityAndFaultDomain() | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| metadata := m.OCIMachine.Spec.Metadata | ||
|
|
@@ -299,14 +264,124 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance, | |
| launchDetails.PreemptibleInstanceConfig = m.getPreemptibleInstanceConfig() | ||
| launchDetails.PlatformConfig = m.getPlatformConfig() | ||
| launchDetails.LaunchVolumeAttachments = m.getLaunchVolumeAttachments() | ||
| req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails, | ||
| OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))} | ||
| resp, err := m.ComputeClient.LaunchInstance(ctx, req) | ||
| // Build the list of fault domains to attempt launch in | ||
| faultDomains := m.buildFaultDomainRetryList(availabilityDomain, faultDomain, retry) | ||
| return m.launchInstanceWithFaultDomainRetry(ctx, launchDetails, faultDomains) | ||
| } | ||
|
|
||
| // launchInstanceWithFaultDomainRetry iterates through the provided fault domains and | ||
| // attempts to launch an instance in each until one succeeds, all fail, or a non-capacity error occurs. | ||
| func (m *MachineScope) launchInstanceWithFaultDomainRetry(ctx context.Context, baseDetails core.LaunchInstanceDetails, faultDomains []string) (*core.Instance, error) { | ||
| if len(faultDomains) == 0 { | ||
| faultDomains = []string{ociutil.DerefString(baseDetails.FaultDomain)} | ||
| } | ||
|
|
||
| if m.OCIMachine == nil { | ||
| return nil, errors.New("machine scope is missing OCIMachine") | ||
| } | ||
|
|
||
| baseRetryToken := ociutil.GetOPCRetryToken(string(m.OCIMachine.UID)) | ||
| var lastErr error | ||
| totalAttempts := len(faultDomains) | ||
|
|
||
| for idx, fd := range faultDomains { | ||
| details := baseDetails | ||
| if fd != "" { | ||
| details.FaultDomain = common.String(fd) | ||
| } | ||
|
|
||
| // Derive a fault-domain-specific retry token to avoid | ||
| // requests being rejected as duplicates. | ||
| tokenVal := ociutil.DerefString(baseRetryToken) | ||
| if fd != "" { | ||
| tokenVal = tokenVal + "-" + fd | ||
| } | ||
| retryToken := common.String(tokenVal) | ||
|
|
||
| m.Logger.Info("Attempting instance launch", "faultDomain", fd, "attemptNumber", idx+1, "totalAttempts", totalAttempts) | ||
| resp, err := m.ComputeClient.LaunchInstance(ctx, core.LaunchInstanceRequest{ | ||
| LaunchInstanceDetails: details, | ||
| OpcRetryToken: retryToken, | ||
| }) | ||
|
|
||
| if err == nil { | ||
| return &resp.Instance, nil | ||
| } | ||
| lastAttempt := idx == len(faultDomains)-1 | ||
| if !ociutil.IsOutOfHostCapacity(err) || lastAttempt { | ||
| return nil, err | ||
| } | ||
| lastErr = err | ||
| nextFD := "" | ||
| if !lastAttempt { | ||
| nextFD = faultDomains[idx+1] | ||
| } | ||
| m.Logger.Info("Fault domain has run out of host capacity, retrying in a different domain", "nextFaultDomain", nextFD) | ||
| } | ||
| return nil, lastErr | ||
| } | ||
|
|
||
| // buildFaultDomainRetryList returns the list of fault domains we should try | ||
| // when launching in the given availability domain. | ||
| func (m *MachineScope) buildFaultDomainRetryList(availabilityDomain, initialFaultDomain string, retry bool) []string { | ||
| faultDomainList := []string{initialFaultDomain} | ||
| if !retry || availabilityDomain == "" { | ||
| return faultDomainList | ||
| } | ||
|
|
||
| for _, fd := range m.faultDomainsForAvailabilityDomain(availabilityDomain) { | ||
| if fd == initialFaultDomain { | ||
| continue | ||
| } | ||
| faultDomainList = append(faultDomainList, fd) | ||
| } | ||
|
|
||
| return faultDomainList | ||
| } | ||
|
|
||
| // faultDomainsForAvailabilityDomain returns the fault domains for the given availability domain. | ||
| func (m *MachineScope) faultDomainsForAvailabilityDomain(availabilityDomain string) []string { | ||
| if adMap := m.OCIClusterAccessor.GetAvailabilityDomains(); adMap != nil { | ||
| if adEntry, ok := adMap[availabilityDomain]; ok { | ||
| return append([]string(nil), adEntry.FaultDomains...) | ||
| } | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func (m *MachineScope) resolveAvailabilityAndFaultDomain() (string, string, bool, error) { | ||
| failureDomainKey := m.Machine.Spec.FailureDomain | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lets stick with calling them FaultDomains here since that is what the function and all the new code calls it. Maybe even make a comment here that CAPI calls them Failure Domains, but OCI calls them FaultDomains
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we could keep using
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is fine then lets rename |
||
| if failureDomainKey == nil { | ||
| randomFailureDomain, err := rand.Int(rand.Reader, big.NewInt(3)) | ||
| if err != nil { | ||
| m.Logger.Error(err, "Failed to generate random failure domain") | ||
| return "", "", false, err | ||
| } | ||
| failureDomainKey = common.String(strconv.Itoa(int(randomFailureDomain.Int64()) + 1)) | ||
| } | ||
|
|
||
| failureDomainIndex, err := strconv.Atoi(*failureDomainKey) | ||
| if err != nil { | ||
| return nil, err | ||
| } else { | ||
| return &resp.Instance, nil | ||
| m.Logger.Error(err, "Failure Domain is not a valid integer") | ||
| return "", "", false, errors.Wrap(err, "invalid failure domain parameter, must be a valid integer") | ||
| } | ||
| if failureDomainIndex < 1 || failureDomainIndex > 3 { | ||
| err = errors.New("failure domain should be a value between 1 and 3") | ||
| m.Logger.Error(err, "Failure domain should be a value between 1 and 3") | ||
| return "", "", false, err | ||
| } | ||
| m.Logger.Info("Failure Domain being used", "failure-domain", failureDomainIndex) | ||
|
|
||
| fdEntry, exists := m.OCIClusterAccessor.GetFailureDomains()[*failureDomainKey] | ||
| if !exists { | ||
| err := errors.New("failure domain not found in cluster failure domains") | ||
| m.Logger.Error(err, "Failure domain not found", "failure-domain", *failureDomainKey) | ||
| return "", "", false, err | ||
| } | ||
| availabilityDomain := fdEntry.Attributes[AvailabilityDomain] | ||
| faultDomain := fdEntry.Attributes[FaultDomain] | ||
|
|
||
| return availabilityDomain, faultDomain, faultDomain != "", nil | ||
| } | ||
|
|
||
| func (m *MachineScope) getFreeFormTags() map[string]string { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lets add GoDoc style comments to the functions.