diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index 72ff9e3e1a..80f91f5cb4 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -44,4 +44,5 @@ export interface RunnerInputParameters { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; + customScaleErrors?: string[]; } diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index a02f62cd36..73eaca6895 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -418,6 +418,19 @@ describe('create runner with errors', () => { expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand); }); + it('test ScaleError with custom scale error.', async () => { + createFleetMockWithErrors(['CustomAWSError']); + + await expect( + createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] })), + ).rejects.toBeInstanceOf(ScaleError); + expect(mockEC2Client).toHaveReceivedCommandWith( + CreateFleetCommand, + expectedCreateFleetRequest(defaultExpectedFleetRequestValues), + ); + expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand); + }); + it('test ScaleError with multiple error.', async () => { createFleetMockWithErrors(['UnfulfillableCapacity', 'SomeError']); @@ -638,6 +651,7 @@ interface RunnerConfig { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; + customScaleErrors?: string[]; } function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters { @@ -657,6 +671,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters { amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName, tracingEnabled: runnerConfig.tracingEnabled, onDemandFailoverOnError: 
runnerConfig.onDemandFailoverOnError, + customScaleErrors: runnerConfig.customScaleErrors, }; } diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 6779dd39d2..f32ec422d9 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -177,7 +177,7 @@ async function processFleetResult( // Educated guess of errors that would make sense to retry based on the list // https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html - const scaleErrors = [ + const defaultScaleErrors = [ 'UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException', @@ -188,6 +188,11 @@ async function processFleetResult( 'InsufficientInstanceCapacity', ]; + const scaleErrors = + runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0 + ? runnerParameters.customScaleErrors + : defaultScaleErrors; + if ( errors.some((e) => runnerParameters.onDemandFailoverOnError?.includes(e)) && runnerParameters.ec2instanceCriteria.targetCapacityType === 'spot' diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts index 07477572ce..91eb515adf 100644 --- a/lambdas/functions/control-plane/src/pool/pool.ts +++ b/lambdas/functions/control-plane/src/pool/pool.ts @@ -41,6 +41,9 @@ export async function adjust(event: PoolEvent): Promise { const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string]) : []; + const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS + ? 
(JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string]) + : []; const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl(); @@ -96,6 +99,7 @@ export async function adjust(event: PoolEvent): Promise { amiIdSsmParameterName, tracingEnabled, onDemandFailoverOnError, + customScaleErrors, }, githubInstallationClient, ); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 477ef147fb..365983d918 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -98,6 +98,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = { subnets: ['subnet-123'], tracingEnabled: false, onDemandFailoverOnError: [], + customScaleErrors: [], }; let expectedRunnerParams: RunnerInputParameters; @@ -115,6 +116,7 @@ function setDefaults() { process.env.INSTANCE_TYPES = 'm5.large'; process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot'; process.env.ENABLE_ON_DEMAND_FAILOVER = undefined; + process.env.CUSTOM_SCALE_ERRORS = undefined; } beforeEach(() => { @@ -587,6 +589,16 @@ describe('scaleUp with public GH', () => { }); }); + it('creates a runner with correct config and labels and custom scale errors enabled.', async () => { + process.env.RUNNER_LABELS = 'label1,label2'; + process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']); + await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); + expect(createRunner).toBeCalledWith({ + ...expectedRunnerParams, + customScaleErrors: ['RequestLimitExceeded'], + }); + }); + it('creates a runner and ensure the group argument is ignored', async () => { process.env.RUNNER_LABELS = 'label1,label2'; process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED'; diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 638edd3232..a95b79f756 100644 --- 
a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -60,6 +60,7 @@ interface CreateEC2RunnerConfig { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; + customScaleErrors?: string[]; } function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) { @@ -251,6 +252,9 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string]) : []; + const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS + ? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string]) + : []; if (ephemeralEnabled && payload.eventType !== 'workflow_job') { logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`); @@ -335,6 +339,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage amiIdSsmParameterName, tracingEnabled, onDemandFailoverOnError, + customScaleErrors, }, githubInstallationClient, ); diff --git a/main.tf b/main.tf index 69a2a5a82d..131de913c6 100644 --- a/main.tf +++ b/main.tf @@ -191,6 +191,7 @@ module "runners" { enable_jit_config = var.enable_jit_config enable_job_queued_check = var.enable_job_queued_check enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors + custom_scale_errors = var.custom_scale_errors disable_runner_autoupdate = var.disable_runner_autoupdate enable_managed_runner_security_group = var.enable_managed_runner_security_group enable_runner_detailed_monitoring = var.enable_runner_detailed_monitoring diff --git a/modules/multi-runner/runners.tf b/modules/multi-runner/runners.tf index 811ab36260..840e51eb10 100644 --- a/modules/multi-runner/runners.tf +++ b/modules/multi-runner/runners.tf @@ -35,6 +35,7 @@ module "runners" { 
github_app_parameters = local.github_app_parameters ebs_optimized = each.value.runner_config.ebs_optimized enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors + custom_scale_errors = each.value.runner_config.custom_scale_errors enable_organization_runners = each.value.runner_config.enable_organization_runners enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners enable_jit_config = each.value.runner_config.enable_jit_config diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf index 0cf8607c09..a4280c2fbd 100644 --- a/modules/multi-runner/variables.tf +++ b/modules/multi-runner/variables.tf @@ -84,6 +84,7 @@ variable "multi_runner_config" { enable_ephemeral_runners = optional(bool, false) enable_job_queued_check = optional(bool, null) enable_on_demand_failover_for_errors = optional(list(string), []) + custom_scale_errors = optional(list(string), []) enable_organization_runners = optional(bool, false) enable_runner_binaries_syncer = optional(bool, true) enable_ssm_on_runners = optional(bool, false) @@ -193,6 +194,7 @@ variable "multi_runner_config" { enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once." enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners." enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later." 
+ custom_scale_errors: "List of AWS error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts" enable_organization_runners: "Register runners to organization, instead of repo level" enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI." enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances." diff --git a/modules/runners/pool.tf b/modules/runners/pool.tf index 2762008ebf..58ee9fd1bb 100644 --- a/modules/runners/pool.tf +++ b/modules/runners/pool.tf @@ -42,6 +42,7 @@ module "pool" { ephemeral = var.enable_ephemeral_runners enable_jit_config = var.enable_jit_config enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors + custom_scale_errors = var.custom_scale_errors boot_time_in_minutes = var.runner_boot_time_in_minutes labels = var.runner_labels launch_template = aws_launch_template.runner diff --git a/modules/runners/pool/main.tf b/modules/runners/pool/main.tf index e141b22d25..403ccb675a 100644 --- a/modules/runners/pool/main.tf +++ b/modules/runners/pool/main.tf @@ -47,6 +47,7 @@ resource "aws_lambda_function" "pool" { POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests POWERTOOLS_TRACER_CAPTURE_ERROR = var.tracing_config.capture_error ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.config.runner.enable_on_demand_failover_for_errors) + CUSTOM_SCALE_ERRORS = jsonencode(var.config.runner.custom_scale_errors) } } diff --git a/modules/runners/pool/variables.tf b/modules/runners/pool/variables.tf index f1e841cde6..b3a237e24d 100644 --- a/modules/runners/pool/variables.tf +++ b/modules/runners/pool/variables.tf
@@ -32,6 +32,7 @@ variable "config" { ephemeral = bool enable_jit_config = bool enable_on_demand_failover_for_errors = list(string) + custom_scale_errors = list(string) boot_time_in_minutes = number labels = list(string) launch_template = object({ diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf index 89d95a50d0..640a23c03b 100644 --- a/modules/runners/scale-up.tf +++ b/modules/runners/scale-up.tf @@ -59,6 +59,7 @@ resource "aws_lambda_function" "scale_up" { SSM_CONFIG_PATH = "${var.ssm_paths.root}/${var.ssm_paths.config}" SUBNET_IDS = join(",", var.subnet_ids) ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.enable_on_demand_failover_for_errors) + CUSTOM_SCALE_ERRORS = jsonencode(var.custom_scale_errors) JOB_RETRY_CONFIG = jsonencode(local.job_retry_config) } } diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf index 856014564c..a912eb757f 100644 --- a/modules/runners/variables.tf +++ b/modules/runners/variables.tf @@ -713,6 +713,12 @@ variable "enable_on_demand_failover_for_errors" { default = [] } +variable "custom_scale_errors" { + description = "List of AWS error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts" + type = list(string) + default = [] +} + variable "lambda_tags" { description = "Map of tags that will be added to all the lambda function resources. Note these are additional tags to the default tags." type = map(string) diff --git a/variables.tf b/variables.tf index 6d6a895873..18646387a6 100644 --- a/variables.tf +++ b/variables.tf @@ -283,6 +283,12 @@ variable "enable_runner_on_demand_failover_for_errors" { default = [] } +variable "custom_scale_errors" { + description = "List of AWS error codes that should trigger retry during scale up.
This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts" + type = list(string) + default = [] +} + variable "enable_userdata" { description = "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI." type = bool