Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,35 @@ describe('Scale down runners', () => {
await expect(scaleDown()).resolves.not.toThrow();
});

it(`Should not terminate instance when de-registration throws an error.`, async () => {
// setup - runner should NOT be terminated because de-registration fails
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)];

const error502 = new RequestError('Server Error', 502, {
request: {
method: 'DELETE',
url: 'https://api.github.com/test',
headers: {},
},
});

mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
throw error502;
});
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
throw error502;
});

mockGitHubRunners(runners);
mockAwsRunners(runners);

// act
await expect(scaleDown()).resolves.not.toThrow();

// assert - should NOT terminate since de-registration failed
expect(terminateRunner).not.toHaveBeenCalled();
});

const evictionStrategies = ['oldest_first', 'newest_first'];
describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => {
const defaultConfig = {
Expand Down
97 changes: 64 additions & 33 deletions lambdas/functions/control-plane/src/scale-runners/scale-down.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,17 +103,20 @@ async function listGitHubRunners(runner: RunnerInfo): Promise<GhRunners> {

logger.debug(`[listGithubRunners] Cache miss for ${key}`);
const client = await getOrCreateOctokit(runner);
const runners =
runner.type === 'Org'
? await client.paginate(client.actions.listSelfHostedRunnersForOrg, {
org: runner.owner,
per_page: 100,
})
: await client.paginate(client.actions.listSelfHostedRunnersForRepo, {
owner: runner.owner.split('/')[0],
repo: runner.owner.split('/')[1],
per_page: 100,
});
let runners;
if (runner.type === 'Org') {
runners = await client.paginate(client.actions.listSelfHostedRunnersForOrg, {
org: runner.owner,
per_page: 100,
});
} else {
const [owner, repo] = runner.owner.split('/');
runners = await client.paginate(client.actions.listSelfHostedRunnersForRepo, {
owner,
repo,
per_page: 100,
});
}
githubCache.runners.set(key, runners);
logger.debug(`[listGithubRunners] Cache set for ${key}`);
logger.debug(`[listGithubRunners] Runners: ${JSON.stringify(runners)}`);
Expand All @@ -127,8 +130,39 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
return launchTimePlusMinimum < now;
}

async function deleteGitHubRunner(
githubInstallationClient: Octokit,
ec2runner: RunnerInfo,
ghRunnerId: number,
): Promise<{ ghRunnerId: number; status: number; success: boolean }> {
try {
let response;
if (ec2runner.type === 'Org') {
response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
});
} else {
const [owner, repo] = ec2runner.owner.split('/');
response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner,
repo,
});
}
return { ghRunnerId, status: response.status, success: response.status === 204 };
} catch (error) {
logger.error(
`Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}'. ` +
`Error: ${error instanceof Error ? error.message : String(error)}`,
{ error },
);
Comment thread
shivdesh marked this conversation as resolved.
return { ghRunnerId, status: 0, success: false };
}
}

async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
const githubAppClient = await getOrCreateOctokit(ec2runner);
const githubInstallationClient = await getOrCreateOctokit(ec2runner);
try {
const runnerList = ec2runner as unknown as RunnerList;
if (runnerList.bypassRemoval) {
Expand All @@ -141,41 +175,38 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
const states = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
return await getGitHubRunnerBusyState(githubInstallationClient, ec2runner, ghRunnerId);
}),
);

if (states.every((busy) => busy === false)) {
const statuses = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return (
ec2runner.type === 'Org'
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
})
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner: ec2runner.owner.split('/')[0],
repo: ec2runner.owner.split('/')[1],
})
).status;
}),
const results = await Promise.all(
ghRunnerIds.map((ghRunnerId) => deleteGitHubRunner(githubInstallationClient, ec2runner, ghRunnerId)),
);

if (statuses.every((status) => status == 204)) {
const allSucceeded = results.every((r) => r.success);
const failedRunners = results.filter((r) => !r.success);

if (allSucceeded) {
await terminateRunner(ec2runner.instanceId);
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
} else {
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
// Only terminate EC2 if we successfully de-registered from GitHub
// Otherwise, leave the instance running so the next scale-down cycle can retry
logger.error(
`Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` +
`Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
`Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`,
);
Comment thread
shivdesh marked this conversation as resolved.
}
} else {
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
}
} catch (e) {
logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, {
error: e as Error,
});
logger.error(
`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e instanceof Error ? e.message : String(e)}`,
{ error: e },
);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,24 @@ describe('scaleUp with GHES', () => {
expect(mockOctokit.actions.createRegistrationTokenForRepo).not.toBeCalled();
});

it('does not create runners when current runners exceed maximum (race condition)', async () => {
process.env.RUNNERS_MAXIMUM_COUNT = '5';
process.env.ENABLE_EPHEMERAL_RUNNERS = 'false';
// Simulate race condition where pool lambda created more runners than max
mockListRunners.mockImplementation(async () =>
Array.from({ length: 10 }, (_, i) => ({
instanceId: `i-${i}`,
launchTime: new Date(),
type: 'Org',
owner: TEST_DATA_SINGLE.repositoryOwner,
})),
);
await scaleUpModule.scaleUp(TEST_DATA);
// Should not attempt to create runners (would be negative without fix)
expect(createRunner).not.toBeCalled();
expect(mockOctokit.actions.createRegistrationTokenForOrg).not.toBeCalled();
});

it('does create a runner if maximum is set to -1', async () => {
process.env.RUNNERS_MAXIMUM_COUNT = '-1';
process.env.ENABLE_EPHEMERAL_RUNNERS = 'false';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -418,12 +418,14 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
});

// Calculate how many runners we want to create.
// Use Math.max(0, ...) to ensure we never attempt to create a negative number of runners,
// which can happen when currentRunners exceeds maximumRunners due to pool/scale-up race conditions.
const newRunners =
maximumRunners === -1
? // If we don't have an upper limit, scale up by the number of new jobs.
scaleUp
: // Otherwise, we do have a limit, so work out if `scaleUp` would exceed it.
Math.min(scaleUp, maximumRunners - currentRunners);
Math.max(0, Math.min(scaleUp, maximumRunners - currentRunners));

const missingInstanceCount = Math.max(0, scaleUp - newRunners);

Expand Down
Loading