Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lambdas/functions/control-plane/src/aws/runners.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,5 @@ export interface RunnerInputParameters {
amiIdSsmParameterName?: string;
tracingEnabled?: boolean;
onDemandFailoverOnError?: string[];
customScaleErrors?: string[];
}
15 changes: 15 additions & 0 deletions lambdas/functions/control-plane/src/aws/runners.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,19 @@ describe('create runner with errors', () => {
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
});

// Checks that an error code listed in `customScaleErrors` causes `createRunner` to reject with a ScaleError.
it('test ScaleError with custom scale error.', async () => {
// Presumably makes the mocked CreateFleet call report the given error code(s) — confirm against the helper.
createFleetMockWithErrors(['CustomAWSError']);

// The runner config lists the same code as a custom scale error, so the rejection must be a ScaleError.
await expect(
createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] })),
).rejects.toBeInstanceOf(ScaleError);
// The fleet request must still have been sent with the default expected request values.
expect(mockEC2Client).toHaveReceivedCommandWith(
CreateFleetCommand,
expectedCreateFleetRequest(defaultExpectedFleetRequestValues),
);
// No PutParameterCommand may have reached the SSM client on this failure path.
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
});

it('test ScaleError with multiple error.', async () => {
createFleetMockWithErrors(['UnfulfillableCapacity', 'SomeError']);

Expand Down Expand Up @@ -638,6 +651,7 @@ interface RunnerConfig {
amiIdSsmParameterName?: string;
tracingEnabled?: boolean;
onDemandFailoverOnError?: string[];
customScaleErrors?: string[];
}

function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
Expand All @@ -657,6 +671,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
tracingEnabled: runnerConfig.tracingEnabled,
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
customScaleErrors: runnerConfig.customScaleErrors,
};
}

Expand Down
7 changes: 6 additions & 1 deletion lambdas/functions/control-plane/src/aws/runners.ts
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ async function processFleetResult(

// Educated guess of errors that would make sense to retry based on the list
// https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html
const scaleErrors = [
const defaultScaleErrors = [
'UnfulfillableCapacity',
'MaxSpotInstanceCountExceeded',
'TargetCapacityLimitExceededException',
Expand All @@ -188,6 +188,11 @@ async function processFleetResult(
'InsufficientInstanceCapacity',
];

const scaleErrors =
runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0
? runnerParameters.customScaleErrors
: defaultScaleErrors;

if (
errors.some((e) => runnerParameters.onDemandFailoverOnError?.includes(e)) &&
runnerParameters.ec2instanceCriteria.targetCapacityType === 'spot'
Expand Down
4 changes: 4 additions & 0 deletions lambdas/functions/control-plane/src/pool/pool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ export async function adjust(event: PoolEvent): Promise<void> {
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
: [];
const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
: [];

const { ghesApiUrl, ghesBaseUrl } = getGitHubEnterpriseApiUrl();

Expand Down Expand Up @@ -96,6 +99,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
amiIdSsmParameterName,
tracingEnabled,
onDemandFailoverOnError,
customScaleErrors,
},
githubInstallationClient,
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = {
subnets: ['subnet-123'],
tracingEnabled: false,
onDemandFailoverOnError: [],
customScaleErrors: [],
};
let expectedRunnerParams: RunnerInputParameters;

Expand All @@ -115,6 +116,7 @@ function setDefaults() {
process.env.INSTANCE_TYPES = 'm5.large';
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
process.env.ENABLE_ON_DEMAND_FAILOVER = undefined;
process.env.CUSTOM_SCALE_ERRORS = undefined;
}

beforeEach(() => {
Expand Down Expand Up @@ -587,6 +589,16 @@ describe('scaleUp with public GH', () => {
});
});

// Checks that the JSON list in the CUSTOM_SCALE_ERRORS environment variable is passed to `createRunner`
// as the `customScaleErrors` runner parameter.
it('creates a runner with correct config and labels and custom scale errors enabled.', async () => {
process.env.RUNNER_LABELS = 'label1,label2';
// The variable carries a JSON-encoded array of error codes.
process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']);
await scaleUpModule.scaleUp('aws:sqs', TEST_DATA);
// Every other parameter keeps its expected default; only `customScaleErrors` is overridden.
expect(createRunner).toBeCalledWith({
...expectedRunnerParams,
customScaleErrors: ['RequestLimitExceeded'],
});
});

it('creates a runner and ensure the group argument is ignored', async () => {
process.env.RUNNER_LABELS = 'label1,label2';
process.env.RUNNER_GROUP_NAME = 'TEST_GROUP_IGNORED';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ interface CreateEC2RunnerConfig {
amiIdSsmParameterName?: string;
tracingEnabled?: boolean;
onDemandFailoverOnError?: string[];
customScaleErrors?: string[];
}

function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) {
Expand Down Expand Up @@ -251,6 +252,9 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS
? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string])
: [];
const customScaleErrors = process.env.CUSTOM_SCALE_ERRORS
? (JSON.parse(process.env.CUSTOM_SCALE_ERRORS) as [string])
: [];

if (ephemeralEnabled && payload.eventType !== 'workflow_job') {
logger.warn(`${payload.eventType} event is not supported in combination with ephemeral runners.`);
Expand Down Expand Up @@ -335,6 +339,7 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage
amiIdSsmParameterName,
tracingEnabled,
onDemandFailoverOnError,
customScaleErrors,
},
githubInstallationClient,
);
Expand Down
1 change: 1 addition & 0 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ module "runners" {
enable_jit_config = var.enable_jit_config
enable_job_queued_check = var.enable_job_queued_check
enable_on_demand_failover_for_errors = var.enable_runner_on_demand_failover_for_errors
custom_scale_errors = var.custom_scale_errors
disable_runner_autoupdate = var.disable_runner_autoupdate
enable_managed_runner_security_group = var.enable_managed_runner_security_group
enable_runner_detailed_monitoring = var.enable_runner_detailed_monitoring
Expand Down
1 change: 1 addition & 0 deletions modules/multi-runner/runners.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ module "runners" {
github_app_parameters = local.github_app_parameters
ebs_optimized = each.value.runner_config.ebs_optimized
enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors
custom_scale_errors = each.value.runner_config.custom_scale_errors
enable_organization_runners = each.value.runner_config.enable_organization_runners
enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners
enable_jit_config = each.value.runner_config.enable_jit_config
Expand Down
2 changes: 2 additions & 0 deletions modules/multi-runner/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ variable "multi_runner_config" {
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
custom_scale_errors = optional(list(string), [])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
Expand Down Expand Up @@ -193,6 +194,7 @@ variable "multi_runner_config" {
enable_ephemeral_runners: "Enable ephemeral runners, runners will only be used once."
enable_job_queued_check: "Enables JIT configuration for creating runners instead of registration token based registration. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners and can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
enable_on_demand_failover_for_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
custom_scale_errors: "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
enable_organization_runners: "Register runners to organization, instead of repo level"
enable_runner_binaries_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
enable_ssm_on_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."
Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool.tf
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ module "pool" {
ephemeral = var.enable_ephemeral_runners
enable_jit_config = var.enable_jit_config
enable_on_demand_failover_for_errors = var.enable_on_demand_failover_for_errors
custom_scale_errors = var.custom_scale_errors
boot_time_in_minutes = var.runner_boot_time_in_minutes
labels = var.runner_labels
launch_template = aws_launch_template.runner
Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ resource "aws_lambda_function" "pool" {
POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests
POWERTOOLS_TRACER_CAPTURE_ERROR = var.tracing_config.capture_error
ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.config.runner.enable_on_demand_failover_for_errors)
CUSTOM_SCALE_ERRORS = jsonencode(var.config.runner.custom_scale_errors)
}
}

Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ variable "config" {
ephemeral = bool
enable_jit_config = bool
enable_on_demand_failover_for_errors = list(string)
custom_scale_errors = list(string)
boot_time_in_minutes = number
labels = list(string)
launch_template = object({
Expand Down
1 change: 1 addition & 0 deletions modules/runners/scale-up.tf
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ resource "aws_lambda_function" "scale_up" {
SSM_CONFIG_PATH = "${var.ssm_paths.root}/${var.ssm_paths.config}"
SUBNET_IDS = join(",", var.subnet_ids)
ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS = jsonencode(var.enable_on_demand_failover_for_errors)
CUSTOM_SCALE_ERRORS = jsonencode(var.custom_scale_errors)
JOB_RETRY_CONFIG = jsonencode(local.job_retry_config)
}
}
Expand Down
6 changes: 6 additions & 0 deletions modules/runners/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,12 @@ variable "enable_on_demand_failover_for_errors" {
default = []
}

# AWS error codes that the runner-creation logic treats as retryable scale errors.
# The list is JSON-encoded into the CUSTOM_SCALE_ERRORS environment variable of the
# scale-up lambda and is also passed on to the pool module.
# An empty list (the default) leaves the built-in `defaultScaleErrors` list in effect.
variable "custom_scale_errors" {
  description = "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
  type        = list(string)
  default     = []
}

variable "lambda_tags" {
description = "Map of tags that will be added to all the lambda function resources. Note these are additional tags to the default tags."
type = map(string)
Expand Down
6 changes: 6 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,12 @@ variable "enable_runner_on_demand_failover_for_errors" {
default = []
}

# AWS error codes that the runner-creation logic treats as retryable scale errors.
# The value is passed unchanged to the `runners` module as `custom_scale_errors`.
# An empty list (the default) leaves the built-in `defaultScaleErrors` list in effect.
variable "custom_scale_errors" {
  description = "List of aws error codes that should trigger retry during scale up. This list will replace the default errors defined in the variable `defaultScaleErrors` in https://github.com/github-aws-runners/terraform-aws-github-runner/blob/main/lambdas/functions/control-plane/src/aws/runners.ts"
  type        = list(string)
  default     = []
}

variable "enable_userdata" {
description = "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI."
type = bool
Expand Down
Loading