github-aws-runners · edersonbrilhante · Dec 4, 2025 · Dec 4, 2025 · Dec 4, 2025 · Dec 4, 2025
@@ -44,4 +44,5 @@ export interface RunnerInputParameters {
   amiIdSsmParameterName?: string;
   tracingEnabled?: boolean;
   onDemandFailoverOnError?: string[];
+  customScaleErrors?: string[];
 }
@@ -418,6 +418,19 @@ describe('create runner with errors', () => {
     expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
   });
 
+  it('test ScaleError with custom scale error.', async () => {
+    createFleetMockWithErrors(['CustomAWSError']);
+
+    await expect(
+      createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] })),
+    ).rejects.toBeInstanceOf(ScaleError);
+    expect(mockEC2Client).toHaveReceivedCommandWith(
+      CreateFleetCommand,
+      expectedCreateFleetRequest(defaultExpectedFleetRequestValues),
+    );
+    expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
+  });
+
   it('test ScaleError with multiple error.', async () => {
     createFleetMockWithErrors(['UnfulfillableCapacity', 'SomeError']);
 
@@ -638,6 +651,7 @@ interface RunnerConfig {
   amiIdSsmParameterName?: string;
   tracingEnabled?: boolean;
   onDemandFailoverOnError?: string[];
+  customScaleErrors?: string[];
 }
 
 function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
@@ -657,6 +671,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
     amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
     tracingEnabled: runnerConfig.tracingEnabled,
     onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
+    customScaleErrors: runnerConfig.customScaleErrors,
   };
 }
 

@@ -177,7 +177,7 @@ async function processFleetResult(
 
     // Educated guess of errors that would make sense to retry based on the list
     // https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
-    const scaleErrors = [
+    const defaultScaleErrors = [
       'UnfulfillableCapacity',
       'MaxSpotInstanceCountExceeded',
       'TargetCapacityLimitExceededException',
@@ -188,6 +188,11 @@ async function processFleetResult(
       'InsufficientInstanceCapacity',
     ];
 
+    const scaleErrors =
+      runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0
+        ? runnerParameters.customScaleErrors
+        : defaultScaleErrors;
+
     if (
       errors.some((e) => runnerParameters.onDemandFailoverOnError?.includes(e)) &&
       runnerParameters.ec2instanceCriteria.targetCapacityType === 'spot'

@@ -41,6 +41,9 @@ export async function adjust(event: PoolEvent): Promise<void> {
   const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
     ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
     : [];
+  const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
+    ? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
+    : [];
 
   const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();
 
@@ -96,6 +99,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
         amiIdSsmParameterName,
         tracingEnabled,
         onDemandFailoverOnError,
+        customScaleErrors,
       },
       githubInstallationClient,
     );

@@ -98,6 +98,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
   subnets: ['subnet-123'],
   tracingEnabled: false,
   onDemandFailoverOnError: [],
+  customScaleErrors: [],
 };
 let expectedRunnerParams: RunnerInputParameters;
 
@@ -115,6 +116,7 @@ function setDefaults() {
   process.env.INSTANCE_TYPES = 'm5.large';
   process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
   process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
+  process.env.CUSTOM_SCALE_ERRORS = undefined;
 }
 
 beforeEach(() => {
@@ -587,6 +589,16 @@ describe('scaleUp with public GH', () => {
       });
     });
 
+    it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
+      process.env.RUNNER_LABELS = 'label1,label2';
+      process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
+      await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
+      expect(createRunner).toBeCalledWith({
+        ...expectedRunnerParams,
+        customScaleErrors: ['RequestLimitExceeded'],
+      });
+    });
+
     it('creates a runner and ensure the group argument is ignored', async () => {
       process.env.RUNNER_LABELS = 'label1,label2';
       process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED';

@@ -60,6 +60,7 @@ interface CreateEC2RunnerConfig {
   amiIdSsmParameterName?: string;
   tracingEnabled?: boolean;
   onDemandFailoverOnError?: string[];
+  customScaleErrors?: string[];
 }
 
 function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
@@ -251,6 +252,9 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
   const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
     ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
     : [];
+  const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
+    ? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
+    : [];
 
   if (ephemeralEnabled && payload.eventType !== 'workflow_job') {
     logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`);
@@ -335,6 +339,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
           amiIdSsmParameterName,
           tracingEnabled,
           onDemandFailoverOnError,
+          customScaleErrors,
         },
         githubInstallationClient,
       );

@@ -191,6 +191,7 @@ module "runners" {
   enable_jit_config                    = var.enable_jit_config
   enable_job_queued_check              = var.enable_job_queued_check
   enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors
+  custom_scale_errors                  = var.custom_scale_errors
   disable_runner_autoupdate            = var.disable_runner_autoupdate
   enable_managed_runner_security_group = var.enable_managed_runner_security_group
   enable_runner_detailed_monitoring    = var.enable_runner_detailed_monitoring

@@ -35,6 +35,7 @@ module "runners" {
   github_app_parameters                = local.github_app_parameters
   ebs_optimized                        = each.value.runner_config.ebs_optimized
   enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors
+  custom_scale_errors                  = each.value.runner_config.custom_scale_errors
   enable_organization_runners          = each.value.runner_config.enable_organization_runners
   enable_ephemeral_runners             = each.value.runner_config.enable_ephemeral_runners
   enable_jit_config                    = each.value.runner_config.enable_jit_config

@@ -84,6 +84,7 @@ variable "multi_runner_config" {
       enable_ephemeral_runners                = optional(bool, false)
       enable_job_queued_check                 = optional(bool, null)
       enable_on_demand_failover_for_errors    = optional(list(string), [])
+      custom_scale_errors                     = optional(list(string), [])
       enable_organization_runners             = optional(bool, false)
       enable_runner_binaries_syncer           = optional(bool, true)
       enable_ssm_on_runners                   = optional(bool, false)
@@ -193,6 +194,7 @@ variable "multi_runner_config" {
         enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once."
         enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
         enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
+        custom_scale_errors: "List of AWS error codes that should trigger a retry during scale up. When empty, the default list is used; otherwise this list replaces the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
         enable_organization_runners: "Register runners to organization, instead of repo level"
         enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
         enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."

@@ -42,6 +42,7 @@ module "pool" {
       ephemeral                            = var.enable_ephemeral_runners
       enable_jit_config                    = var.enable_jit_config
       enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors
+      custom_scale_errors                  = var.custom_scale_errors
       boot_time_in_minutes                 = var.runner_boot_time_in_minutes
       labels                               = var.runner_labels
       launch_template                      = aws_launch_template.runner

@@ -47,6 +47,7 @@ resource "aws_lambda_function" "pool" {
       POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests
       POWERTOOLS_TRACER_CAPTURE_ERROR          = var.tracing_config.capture_error
       ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS     = jsonencode(var.config.runner.enable_on_demand_failover_for_errors)
+      CUSTOM_SCALE_ERRORS                      = jsonencode(var.config.runner.custom_scale_errors)
     }
   }
 

@@ -32,6 +32,7 @@ variable "config" {
       ephemeral                            = bool
       enable_jit_config                    = bool
       enable_on_demand_failover_for_errors = list(string)
+      custom_scale_errors                  = list(string)
       boot_time_in_minutes                 = number
       labels                               = list(string)
       launch_template = object({

@@ -59,6 +59,7 @@ resource "aws_lambda_function" "scale_up" {
       SSM_CONFIG_PATH                          = "${var.ssm_paths.root}/${var.ssm_paths.config}"
       SUBNET_IDS                               = join(",", var.subnet_ids)
       ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS     = jsonencode(var.enable_on_demand_failover_for_errors)
+      CUSTOM_SCALE_ERRORS                      = jsonencode(var.custom_scale_errors)
       JOB_RETRY_CONFIG                         = jsonencode(local.job_retry_config)
     }
   }

@@ -713,6 +713,12 @@ variable "enable_on_demand_failover_for_errors" {
   default     = []
 }
 
+variable "custom_scale_errors" {
+  description = "List of AWS error codes that should trigger a retry during scale up. When empty, the default list is used; otherwise this list replaces the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
+  type        = list(string)
+  default     = []
+}
+
 variable "lambda_tags" {
   description = "Map of tags that will be added to all the lambda function resources. Note these are additional tags to the default tags."
   type        = map(string)

@@ -283,6 +283,12 @@ variable "enable_runner_on_demand_failover_for_errors" {
   default     = []
 }
 
+variable "custom_scale_errors" {
+  description = "List of AWS error codes that should trigger a retry during scale up. When empty, the default list is used; otherwise this list replaces the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
+  type        = list(string)
+  default     = []
+}
+
 variable "enable_userdata" {
   description = "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI."
   type        = bool