From 33d5a02821d2054cdda8a9090326084effd68255 Mon Sep 17 00:00:00 2001 From: edersonbrilhante Date: Thu, 4 Dec 2025 14:45:34 +0100 Subject: [PATCH 1/4] feat: add support to use custom scale errors --- .../functions/control-plane/src/aws/runners.d.ts | 1 + .../functions/control-plane/src/aws/runners.test.ts | 13 +++++++++++++ lambdas/functions/control-plane/src/aws/runners.ts | 7 ++++++- lambdas/functions/control-plane/src/pool/pool.ts | 4 ++++ .../src/scale-runners/scale-up.test.ts | 12 ++++++++++++ .../control-plane/src/scale-runners/scale-up.ts | 5 +++++ main.tf | 1 + modules/multi-runner/runners.tf | 1 + modules/multi-runner/variables.tf | 2 ++ modules/runners/pool.tf | 1 + modules/runners/pool/main.tf | 1 + modules/runners/pool/variables.tf | 1 + modules/runners/scale-up.tf | 1 + modules/runners/variables.tf | 6 ++++++ variables.tf | 6 ++++++ 15 files changed, 61 insertions(+), 1 deletion(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index 72ff9e3e1a..80f91f5cb4 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -44,4 +44,5 @@ export interface RunnerInputParameters { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; + customScaleErrors?: string[]; } diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index a02f62cd36..ab47754f3c 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -418,6 +418,17 @@ describe('create runner with errors', () => { expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand); }); + it('test ScaleError with custom scale error.', async () => { + createFleetMockWithErrors(['CustomAWSError']); + + await expect(createRunner(createRunnerConfig({ ...defaultRunnerConfig, 
customScaleErrors: ['CustomAWSError'] }))).rejects.toBeInstanceOf(ScaleError); + expect(mockEC2Client).toHaveReceivedCommandWith( + CreateFleetCommand, + expectedCreateFleetRequest(defaultExpectedFleetRequestValues), + ); + expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand); + }); + it('test ScaleError with multiple error.', async () => { createFleetMockWithErrors(['UnfulfillableCapacity', 'SomeError']); @@ -638,6 +649,7 @@ interface RunnerConfig { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; + customScaleErrors?: string[]; } function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters { @@ -657,6 +669,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters { amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName, tracingEnabled: runnerConfig.tracingEnabled, onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError, + customScaleErrors: runnerConfig.customScaleErrors, }; } diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 6779dd39d2..f32ec422d9 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -177,7 +177,7 @@ async function processFleetResult( // Educated guess of errors that would make sense to retry based on the list // https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html - const scaleErrors = [ + const defaultScaleErrors = [ 'UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException', @@ -188,6 +188,11 @@ async function processFleetResult( 'InsufficientInstanceCapacity', ]; + const scaleErrors = + runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0 + ? 
runnerParameters.customScaleErrors + : defaultScaleErrors; + if ( errors.some((e) => runnerParameters.onDemandFailoverOnError?.includes(e)) && runnerParameters.ec2instanceCriteria.targetCapacityType === 'spot' diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts index 07477572ce..ccc53916ed 100644 --- a/lambdas/functions/control-plane/src/pool/pool.ts +++ b/lambdas/functions/control-plane/src/pool/pool.ts @@ -41,6 +41,9 @@ export async function adjust(event: PoolEvent): Promise { const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string]) : []; + const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS + ? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string]) + : []; const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl(); @@ -96,6 +99,7 @@ export async function adjust(event: PoolEvent): Promise { amiIdSsmParameterName, tracingEnabled, onDemandFailoverOnError, + customScaleErrors }, githubInstallationClient, ); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 477ef147fb..365983d918 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -98,6 +98,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = { subnets: ['subnet-123'], tracingEnabled: false, onDemandFailoverOnError: [], + customScaleErrors: [], }; let expectedRunnerParams: RunnerInputParameters; @@ -115,6 +116,7 @@ function setDefaults() { process.env.INSTANCE_TYPES = 'm5.large'; process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot'; process.env.ENABLE_ON_DEMAND_FAILOVER = undefined; + process.env.CUSTOM_SCALE_ERRORS = undefined; } beforeEach(() => { @@ -587,6 +589,16 @@ describe('scaleUp with public GH', () => { }); }); + it('creates a 
runner with correct config and labels and custom scale errors enabled.', async () => { + process.env.RUNNER_LABELS = 'label1,label2'; + process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']); + await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); + expect(createRunner).toBeCalledWith({ + ...expectedRunnerParams, + customScaleErrors: ['RequestLimitExceeded'], + }); + }); + it('creates a runner and ensure the group argument is ignored', async () => { process.env.RUNNER_LABELS = 'label1,label2'; process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED'; diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index 638edd3232..a95b79f756 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -60,6 +60,7 @@ interface CreateEC2RunnerConfig { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; + customScaleErrors?: string[]; } function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) { @@ -251,6 +252,9 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string]) : []; + const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS + ? 
(JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string]) + : []; if (ephemeralEnabled && payload.eventType !== 'workflow_job') { logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`); @@ -335,6 +339,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage amiIdSsmParameterName, tracingEnabled, onDemandFailoverOnError, + customScaleErrors, }, githubInstallationClient, ); diff --git a/main.tf b/main.tf index 69a2a5a82d..50c6267792 100644 --- a/main.tf +++ b/main.tf @@ -191,6 +191,7 @@ module "runners" { enable_jit_config = var.enable_jit_config enable_job_queued_check = var.enable_job_queued_check enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors + ccustom_scale_errors = var.custom_scale_errors disable_runner_autoupdate = var.disable_runner_autoupdate enable_managed_runner_security_group = var.enable_managed_runner_security_group enable_runner_detailed_monitoring = var.enable_runner_detailed_monitoring diff --git a/modules/multi-runner/runners.tf b/modules/multi-runner/runners.tf index 811ab36260..840e51eb10 100644 --- a/modules/multi-runner/runners.tf +++ b/modules/multi-runner/runners.tf @@ -35,6 +35,7 @@ module "runners" { github_app_parameters = local.github_app_parameters ebs_optimized = each.value.runner_config.ebs_optimized enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors + custom_scale_errors = each.value.runner_config.custom_scale_errors enable_organization_runners = each.value.runner_config.enable_organization_runners enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners enable_jit_config = each.value.runner_config.enable_jit_config diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf index 0cf8607c09..a5bcc87a7d 100644 --- a/modules/multi-runner/variables.tf +++ b/modules/multi-runner/variables.tf @@ -84,6 +84,7 @@ variable 
"multi_runner_config" { enable_ephemeral_runners = optional(bool, false) enable_job_queued_check = optional(bool, null) enable_on_demand_failover_for_errors = optional(list(string), []) + custom_scale_errors = optional(list(string), []) enable_organization_runners = optional(bool, false) enable_runner_binaries_syncer = optional(bool, true) enable_ssm_on_runners = optional(bool, false) @@ -193,6 +194,7 @@ variable "multi_runner_config" { enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once." enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners." enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later." + custom_scale_errors: "List of custom aws error codes that should trigger retry during scale up." enable_organization_runners: "Register runners to organization, instead of repo level" enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI." enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances." 
diff --git a/modules/runners/pool.tf b/modules/runners/pool.tf index 2762008ebf..58ee9fd1bb 100644 --- a/modules/runners/pool.tf +++ b/modules/runners/pool.tf @@ -42,6 +42,7 @@ module "pool" { ephemeral = var.enable_ephemeral_runners enable_jit_config = var.enable_jit_config enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors + custom_scale_errors = var.custom_scale_errors boot_time_in_minutes = var.runner_boot_time_in_minutes labels = var.runner_labels launch_template = aws_launch_template.runner diff --git a/modules/runners/pool/main.tf b/modules/runners/pool/main.tf index e141b22d25..403ccb675a 100644 --- a/modules/runners/pool/main.tf +++ b/modules/runners/pool/main.tf @@ -47,6 +47,7 @@ resource "aws_lambda_function" "pool" { POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests POWERTOOLS_TRACER_CAPTURE_ERROR = var.tracing_config.capture_error ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.config.runner.enable_on_demand_failover_for_errors) + CUSTOM_SCALE_ERRORS = jsonencode(var.config.runner.custom_scale_errors) } } diff --git a/modules/runners/pool/variables.tf b/modules/runners/pool/variables.tf index f1e841cde6..b3a237e24d 100644 --- a/modules/runners/pool/variables.tf +++ b/modules/runners/pool/variables.tf @@ -32,6 +32,7 @@ variable "config" { ephemeral = bool enable_jit_config = bool enable_on_demand_failover_for_errors = list(string) + custom_scale_errors = list(string) boot_time_in_minutes = number labels = list(string) launch_template = object({ diff --git a/modules/runners/scale-up.tf b/modules/runners/scale-up.tf index 89d95a50d0..640a23c03b 100644 --- a/modules/runners/scale-up.tf +++ b/modules/runners/scale-up.tf @@ -59,6 +59,7 @@ resource "aws_lambda_function" "scale_up" { SSM_CONFIG_PATH = "${var.ssm_paths.root}/${var.ssm_paths.config}" SUBNET_IDS = join(",", var.subnet_ids) ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.enable_on_demand_failover_for_errors) + 
CUSTOM_SCALE_ERRORS = jsonencode(var.custom_scale_errors) JOB_RETRY_CONFIG = jsonencode(local.job_retry_config) } } diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf index 856014564c..b2695bdb12 100644 --- a/modules/runners/variables.tf +++ b/modules/runners/variables.tf @@ -713,6 +713,12 @@ variable "enable_on_demand_failover_for_errors" { default = [] } +variable "custom_scale_errors" { + description = "List of custom aws error codes that should trigger retry during scale up." + type = list(string) + default = [] +} + variable "lambda_tags" { description = "Map of tags that will be added to all the lambda function resources. Note these are additional tags to the default tags." type = map(string) diff --git a/variables.tf b/variables.tf index 6d6a895873..55d9706458 100644 --- a/variables.tf +++ b/variables.tf @@ -283,6 +283,12 @@ variable "enable_runner_on_demand_failover_for_errors" { default = [] } +variable "custom_scale_errors" { + description = "List of custom aws error codes that should trigger retry during scale up." + type = list(string) + default = [] +} + variable "enable_userdata" { description = "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI." type = bool From 4072a3de428cb484c108e37a3df548136089685e Mon Sep 17 00:00:00 2001 From: edersonbrilhante Date: Thu, 4 Dec 2025 14:49:47 +0100 Subject: [PATCH 2/4] chore: update variable description --- modules/multi-runner/variables.tf | 2 +- modules/runners/variables.tf | 2 +- variables.tf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf index a5bcc87a7d..a4280c2fbd 100644 --- a/modules/multi-runner/variables.tf +++ b/modules/multi-runner/variables.tf @@ -194,7 +194,7 @@ variable "multi_runner_config" { enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once." 
enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners." enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later." - custom_scale_errors: "List of custom aws error codes that should trigger retry during scale up." + custom_scale_errors: "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts" enable_organization_runners: "Register runners to organization, instead of repo level" enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI." enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances." diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf index b2695bdb12..a912eb757f 100644 --- a/modules/runners/variables.tf +++ b/modules/runners/variables.tf @@ -714,7 +714,7 @@ variable "enable_on_demand_failover_for_errors" { } variable "custom_scale_errors" { - description = "List of custom aws error codes that should trigger retry during scale up." + description = "List of aws error codes that should trigger retry during scale up. 
This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts" type = list(string) default = [] } diff --git a/variables.tf b/variables.tf index 55d9706458..18646387a6 100644 --- a/variables.tf +++ b/variables.tf @@ -284,7 +284,7 @@ variable "enable_runner_on_demand_failover_for_errors" { } variable "custom_scale_errors" { - description = "List of custom aws error codes that should trigger retry during scale up." + description = "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts" type = list(string) default = [] } From 6c6f85cc2d6aa16e0ff85d24a429c33c2f79abe9 Mon Sep 17 00:00:00 2001 From: edersonbrilhante Date: Thu, 4 Dec 2025 14:56:34 +0100 Subject: [PATCH 3/4] fix: fix typo --- main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.tf b/main.tf index 50c6267792..131de913c6 100644 --- a/main.tf +++ b/main.tf @@ -191,7 +191,7 @@ module "runners" { enable_jit_config = var.enable_jit_config enable_job_queued_check = var.enable_job_queued_check enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors - ccustom_scale_errors = var.custom_scale_errors + custom_scale_errors = var.custom_scale_errors disable_runner_autoupdate = var.disable_runner_autoupdate enable_managed_runner_security_group = var.enable_managed_runner_security_group enable_runner_detailed_monitoring = var.enable_runner_detailed_monitoring From 68f1f4b371f90272a33bca38e891b0e966e48ca1 Mon Sep 17 00:00:00 2001 From: edersonbrilhante Date: Thu, 4 Dec 2025 15:03:34 +0100 Subject: [PATCH 4/4] style: fix ts formating --- 
lambdas/functions/control-plane/src/aws/runners.test.ts | 6 ++++-- lambdas/functions/control-plane/src/pool/pool.ts | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index ab47754f3c..73eaca6895 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -420,8 +420,10 @@ describe('create runner with errors', () => { it('test ScaleError with custom scale error.', async () => { createFleetMockWithErrors(['CustomAWSError']); - - await expect(createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] }))).rejects.toBeInstanceOf(ScaleError); + + await expect( + createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] })), + ).rejects.toBeInstanceOf(ScaleError); expect(mockEC2Client).toHaveReceivedCommandWith( CreateFleetCommand, expectedCreateFleetRequest(defaultExpectedFleetRequestValues), diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts index ccc53916ed..91eb515adf 100644 --- a/lambdas/functions/control-plane/src/pool/pool.ts +++ b/lambdas/functions/control-plane/src/pool/pool.ts @@ -99,7 +99,7 @@ export async function adjust(event: PoolEvent): Promise { amiIdSsmParameterName, tracingEnabled, onDemandFailoverOnError, - customScaleErrors + customScaleErrors, }, githubInstallationClient, );