From 997109f37b70d694b7f46fe1428bc4aed94fc0c8 Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Fri, 5 Dec 2025 03:12:43 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=A4=96=20feat:=20SSH=20connection=20p?= =?UTF-8?q?ool=20with=20backoff=20and=20singleflighting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevents thundering herd issues with SSH connections by: - Adding SSHConnectionPool class with health tracking - Implementing exponential backoff (1s → 5s → 10s → 20s → 40s → 60s cap) - Singleflighting concurrent connection attempts to same host - Probing unknown connections before first use - Skipping probes for known-healthy connections Integration points: - SSHRuntime.exec() and execSSHCommand() call acquireConnection() - PTYService calls acquireConnection() before spawning SSH terminals _Generated with mux_ --- src/node/runtime/SSHRuntime.ts | 44 ++-- src/node/runtime/sshConnectionPool.test.ts | 151 +++++++++++- src/node/runtime/sshConnectionPool.ts | 272 ++++++++++++++++++++- src/node/services/ptyService.ts | 7 +- src/node/services/tools/bash.test.ts | 11 +- 5 files changed, 456 insertions(+), 29 deletions(-) diff --git a/src/node/runtime/SSHRuntime.ts b/src/node/runtime/SSHRuntime.ts index 076c95198..346c9136b 100644 --- a/src/node/runtime/SSHRuntime.ts +++ b/src/node/runtime/SSHRuntime.ts @@ -24,7 +24,7 @@ import { expandTildeForSSH, cdCommandForSSH } from "./tildeExpansion"; import { getProjectName } from "@/node/utils/runtime/helpers"; import { getErrorMessage } from "@/common/utils/errors"; import { execAsync, DisposableProcess } from "@/node/utils/disposableExec"; -import { getControlPath } from "./sshConnectionPool"; +import { getControlPath, sshConnectionPool, type SSHRuntimeConfig } from "./sshConnectionPool"; import { getBashPath } from "@/node/utils/main/bashPath"; /** @@ -40,19 +40,8 @@ const shescape = { }, }; -/** - * SSH Runtime Configuration - */ -export interface SSHRuntimeConfig { - /** SSH host (can be hostname, user@host, or SSH config alias) */ - host: string; - /** Working directory on remote host */ - srcBaseDir: string; - /** Optional: Path to SSH private key (if not using ~/.ssh/config or ssh-agent) */ - identityFile?: string; - /** Optional: SSH port (default: 22) */ - port?: number; -} +// Re-export SSHRuntimeConfig from connection pool to maintain API compatibility +export type { SSHRuntimeConfig } from "./sshConnectionPool"; /** * SSH runtime implementation that executes commands and file operations @@ -92,7 +81,6 @@ export class SSHRuntime implements Runtime { /** * Execute command over SSH with streaming I/O */ - // eslint-disable-next-line @typescript-eslint/require-await async exec(command: string, options: ExecOptions): Promise { const startTime = performance.now(); @@ -101,6 +89,10 @@ export class SSHRuntime implements Runtime { throw new RuntimeErrorClass("Operation aborted before execution", "exec"); } + // Ensure connection is healthy before executing + // This provides backoff protection and singleflighting for concurrent requests + await sshConnectionPool.acquireConnection(this.config); + // Build command parts const parts: string[] = []; @@ -218,11 +210,22 @@ export class SSHRuntime implements Runtime { resolve(EXIT_CODE_TIMEOUT); return; } - resolve(code ?? (signal ? -1 : 0)); + + const exitCode = code ?? (signal ? -1 : 0); + + // SSH exit code 255 indicates connection failure - report to pool for backoff + // This prevents thundering herd when a previously healthy host goes down + if (exitCode === 255) { + sshConnectionPool.reportFailure(this.config, "SSH connection failed (exit code 255)"); + } + + resolve(exitCode); // Cleanup runs automatically via DisposableProcess }); sshProcess.on("error", (err) => { + // Spawn errors are connection-level failures + sshConnectionPool.reportFailure(this.config, `SSH spawn error: ${err.message}`); reject(new RuntimeErrorClass(`Failed to execute SSH command: ${err.message}`, "exec", err)); }); }); @@ -406,6 +409,9 @@ export class SSHRuntime implements Runtime { * @private */ private async execSSHCommand(command: string, timeoutMs: number): Promise { + // Ensure connection is healthy before executing + await sshConnectionPool.acquireConnection(this.config, timeoutMs); + const sshArgs = this.buildSSHArgs(); sshArgs.push(this.config.host, command); @@ -440,6 +446,10 @@ export class SSHRuntime implements Runtime { if (timedOut) return; // Already rejected if (code !== 0) { + // SSH exit code 255 indicates connection failure - report to pool for backoff + if (code === 255) { + sshConnectionPool.reportFailure(this.config, "SSH connection failed (exit code 255)"); + } reject(new RuntimeErrorClass(`SSH command failed: ${stderr.trim()}`, "network")); return; } @@ -452,6 +462,8 @@ export class SSHRuntime implements Runtime { clearTimeout(timer); if (timedOut) return; // Already rejected + // Spawn errors are connection-level failures + sshConnectionPool.reportFailure(this.config, `SSH spawn error: ${getErrorMessage(err)}`); reject( new RuntimeErrorClass( `Cannot execute SSH command: ${getErrorMessage(err)}`, diff --git a/src/node/runtime/sshConnectionPool.test.ts b/src/node/runtime/sshConnectionPool.test.ts index cfc7072c6..c74fd06c2 100644 --- a/src/node/runtime/sshConnectionPool.test.ts +++ b/src/node/runtime/sshConnectionPool.test.ts @@ -1,7 +1,6 @@ import * as os from "os"; import * as path from "path"; -import { getControlPath } from "./sshConnectionPool"; -import type { SSHRuntimeConfig } from "./SSHRuntime"; +import { getControlPath, SSHConnectionPool, type SSHRuntimeConfig } from "./sshConnectionPool"; describe("sshConnectionPool", () => { describe("getControlPath", () => { @@ -134,3 +133,151 @@ describe("username isolation", () => { expect(controlPath).toMatch(/mux-ssh-[a-f0-9]{12}$/); }); }); + +describe("SSHConnectionPool", () => { + describe("health tracking", () => { + test("getConnectionHealth returns undefined for unknown connection", () => { + const pool = new SSHConnectionPool(); + const config: SSHRuntimeConfig = { + host: "unknown.example.com", + srcBaseDir: "/work", + }; + + expect(pool.getConnectionHealth(config)).toBeUndefined(); + }); + + test("markHealthy sets connection to healthy state", () => { + const pool = new SSHConnectionPool(); + const config: SSHRuntimeConfig = { + host: "test.example.com", + srcBaseDir: "/work", + }; + + pool.markHealthy(config); + const health = pool.getConnectionHealth(config); + + expect(health).toBeDefined(); + expect(health!.status).toBe("healthy"); + expect(health!.consecutiveFailures).toBe(0); + expect(health!.lastSuccess).toBeInstanceOf(Date); + }); + + test("reportFailure puts connection into backoff", () => { + const pool = new SSHConnectionPool(); + const config: SSHRuntimeConfig = { + host: "test.example.com", + srcBaseDir: "/work", + }; + + // Mark healthy first + pool.markHealthy(config); + expect(pool.getConnectionHealth(config)?.status).toBe("healthy"); + + // Report a failure + pool.reportFailure(config, "Connection refused"); + const health = pool.getConnectionHealth(config); + + expect(health?.status).toBe("unhealthy"); + expect(health?.consecutiveFailures).toBe(1); + expect(health?.lastError).toBe("Connection refused"); + expect(health?.backoffUntil).toBeDefined(); + }); + + test("resetBackoff clears backoff state after failed probe", async () => { + const pool = new SSHConnectionPool(); + const config: SSHRuntimeConfig = { + host: "nonexistent.invalid.host.test", + srcBaseDir: "/work", + }; + + // Trigger a failure via acquireConnection (will fail to connect) + await expect(pool.acquireConnection(config, 1000)).rejects.toThrow(); + + // Verify we're now in backoff + const healthBefore = pool.getConnectionHealth(config); + expect(healthBefore?.status).toBe("unhealthy"); + expect(healthBefore?.backoffUntil).toBeDefined(); + + // Reset backoff + pool.resetBackoff(config); + const healthAfter = pool.getConnectionHealth(config); + + expect(healthAfter).toBeDefined(); + expect(healthAfter!.status).toBe("unknown"); + expect(healthAfter!.consecutiveFailures).toBe(0); + expect(healthAfter!.backoffUntil).toBeUndefined(); + }); + }); + + describe("acquireConnection", () => { + test("returns immediately for known healthy connection", async () => { + const pool = new SSHConnectionPool(); + const config: SSHRuntimeConfig = { + host: "test.example.com", + srcBaseDir: "/work", + }; + + // Mark as healthy first + pool.markHealthy(config); + + // Should return immediately without probing + const start = Date.now(); + await pool.acquireConnection(config); + const elapsed = Date.now() - start; + + // Should be nearly instant (< 50ms) + expect(elapsed).toBeLessThan(50); + }); + + test("throws immediately when in backoff", async () => { + const pool = new SSHConnectionPool(); + const config: SSHRuntimeConfig = { + host: "nonexistent.invalid.host.test", + srcBaseDir: "/work", + }; + + // Trigger a failure to put connection in backoff + await expect(pool.acquireConnection(config, 1000)).rejects.toThrow(); + + // Second call should throw immediately with backoff message + await expect(pool.acquireConnection(config)).rejects.toThrow(/in backoff/); + }); + + test("getControlPath returns deterministic path", () => { + const pool = new SSHConnectionPool(); + const config: SSHRuntimeConfig = { + host: "test.example.com", + srcBaseDir: "/work", + }; + + const path1 = pool.getControlPath(config); + const path2 = pool.getControlPath(config); + + expect(path1).toBe(path2); + expect(path1).toBe(getControlPath(config)); + }); + }); + + describe("singleflighting", () => { + test("concurrent acquireConnection calls share same probe", async () => { + const pool = new SSHConnectionPool(); + const config: SSHRuntimeConfig = { + host: "test.example.com", + srcBaseDir: "/work", + }; + + // Mark healthy to avoid actual probe + pool.markHealthy(config); + + // Multiple concurrent calls should all succeed + const results = await Promise.all([ + pool.acquireConnection(config), + pool.acquireConnection(config), + pool.acquireConnection(config), + ]); + + // All should resolve (no errors) + expect(results).toHaveLength(3); + }); + }); +}); diff --git a/src/node/runtime/sshConnectionPool.ts b/src/node/runtime/sshConnectionPool.ts index e3e597dd0..4a45af7df 100644 --- a/src/node/runtime/sshConnectionPool.ts +++ b/src/node/runtime/sshConnectionPool.ts @@ -1,19 +1,277 @@ /** - * SSH Connection Pool - Stateless + * SSH Connection Pool * - * Generates deterministic ControlPath from SSH config to enable connection - * multiplexing across SSHRuntime instances targeting the same host. + * Manages SSH connections with: + * - Deterministic ControlPath generation for connection multiplexing + * - Health tracking to avoid re-probing known-healthy connections + * - Exponential backoff to prevent thundering herd on failures + * - Singleflighting to coalesce concurrent connection attempts * * Design: - * - Pure function: same config → same controlPath - * - No state: filesystem is the state - * - No cleanup: ControlPersist + OS handle it + * - acquireConnection() ensures a healthy connection before proceeding + * - Known-healthy connections return immediately (no probe) + * - Failed connections enter backoff before retry + * - Concurrent calls to same host share a single probe */ import * as crypto from "crypto"; import * as path from "path"; import * as os from "os"; -import type { SSHRuntimeConfig } from "./SSHRuntime"; +import { spawn } from "child_process"; +import { log } from "@/node/services/log"; + +/** + * SSH Runtime Configuration (imported type to avoid circular deps) + */ +export interface SSHRuntimeConfig { + host: string; + srcBaseDir: string; + identityFile?: string; + port?: number; +} + +/** + * Connection health status + */ +export type ConnectionStatus = "healthy" | "unhealthy" | "unknown"; + +/** + * Connection health state for a single SSH target + */ +export interface ConnectionHealth { + status: ConnectionStatus; + lastSuccess?: Date; + lastFailure?: Date; + lastError?: string; + backoffUntil?: Date; + consecutiveFailures: number; +} + +/** + * Backoff schedule in seconds: 1s → 5s → 10s → 20s → 40s → 60s (cap) + */ +const BACKOFF_SCHEDULE = [1, 5, 10, 20, 40, 60]; + +/** + * SSH Connection Pool + * + * Call acquireConnection() before any SSH operation to ensure the connection + * is healthy. This prevents thundering herd issues by: + * 1. Returning immediately for known-healthy connections + * 2. Coalescing concurrent probes via singleflighting + * 3. Enforcing backoff after failures + */ +export class SSHConnectionPool { + private health = new Map(); + private inflight = new Map>(); + + /** + * Ensure connection is healthy before proceeding. + * + * @param config SSH configuration + * @param timeoutMs Timeout for health check probe (default: 10s) + * @throws Error if connection is in backoff or health check fails + */ + async acquireConnection(config: SSHRuntimeConfig, timeoutMs = 10000): Promise { + const key = makeConnectionKey(config); + const health = this.health.get(key); + + // Check if in backoff + if (health?.backoffUntil && health.backoffUntil > new Date()) { + const remainingSecs = Math.ceil((health.backoffUntil.getTime() - Date.now()) / 1000); + throw new Error( + `SSH connection to ${config.host} is in backoff for ${remainingSecs}s. ` + + `Last error: ${health.lastError ?? "unknown"}` + ); + } + + // Return immediately if known healthy + if (health?.status === "healthy") { + log.debug(`SSH connection to ${config.host} is known healthy, skipping probe`); + return; + } + + // Check for inflight probe - singleflighting + const existing = this.inflight.get(key); + if (existing) { + log.debug(`SSH connection to ${config.host} has inflight probe, waiting...`); + return existing; + } + + // Start new probe + log.debug(`SSH connection to ${config.host} needs probe, starting health check`); + const probe = this.probeConnection(config, timeoutMs, key); + this.inflight.set(key, probe); + + try { + await probe; + } finally { + this.inflight.delete(key); + } + } + + /** + * Get current health status for a connection + */ + getConnectionHealth(config: SSHRuntimeConfig): ConnectionHealth | undefined { + const key = makeConnectionKey(config); + return this.health.get(key); + } + + /** + * Get deterministic controlPath for SSH config. + */ + getControlPath(config: SSHRuntimeConfig): string { + return getControlPath(config); + } + + /** + * Reset backoff for a connection (e.g., after user intervention) + */ + resetBackoff(config: SSHRuntimeConfig): void { + const key = makeConnectionKey(config); + const health = this.health.get(key); + if (health) { + health.backoffUntil = undefined; + health.consecutiveFailures = 0; + health.status = "unknown"; + log.info(`Reset backoff for SSH connection to ${config.host}`); + } + } + + /** + * Mark connection as healthy. + * Call after successful SSH operations to maintain health state. + */ + markHealthy(config: SSHRuntimeConfig): void { + const key = makeConnectionKey(config); + this.markHealthyByKey(key); + } + + /** + * Report a connection failure. + * Call when SSH operations fail due to connection issues (not command failures). + * This triggers backoff to prevent thundering herd on a failing host. + */ + reportFailure(config: SSHRuntimeConfig, error: string): void { + const key = makeConnectionKey(config); + this.markFailedByKey(key, error); + } + + /** + * Mark connection as healthy by key (internal use) + */ + private markHealthyByKey(key: string): void { + this.health.set(key, { + status: "healthy", + lastSuccess: new Date(), + consecutiveFailures: 0, + }); + } + + /** + * Mark connection as failed (internal use after failed probe) + */ + private markFailedByKey(key: string, error: string): void { + const current = this.health.get(key); + const failures = (current?.consecutiveFailures ?? 0) + 1; + const backoffIndex = Math.min(failures - 1, BACKOFF_SCHEDULE.length - 1); + const backoffSecs = BACKOFF_SCHEDULE[backoffIndex]; + + this.health.set(key, { + status: "unhealthy", + lastFailure: new Date(), + lastError: error, + backoffUntil: new Date(Date.now() + backoffSecs * 1000), + consecutiveFailures: failures, + }); + + log.warn( + `SSH connection failed (${failures} consecutive). Backoff for ${backoffSecs}s. Error: ${error}` + ); + } + + /** + * Probe connection health by running a simple command + */ + private async probeConnection( + config: SSHRuntimeConfig, + timeoutMs: number, + key: string + ): Promise { + const controlPath = getControlPath(config); + + const args: string[] = ["-T"]; // No PTY needed for probe + + if (config.port) { + args.push("-p", config.port.toString()); + } + + if (config.identityFile) { + args.push("-i", config.identityFile); + args.push("-o", "StrictHostKeyChecking=no"); + args.push("-o", "UserKnownHostsFile=/dev/null"); + args.push("-o", "LogLevel=ERROR"); + } + + // Connection multiplexing + args.push("-o", "ControlMaster=auto"); + args.push("-o", `ControlPath=${controlPath}`); + args.push("-o", "ControlPersist=60"); + + // Aggressive timeouts for probe + const connectTimeout = Math.min(Math.ceil(timeoutMs / 1000), 15); + args.push("-o", `ConnectTimeout=${connectTimeout}`); + args.push("-o", "ServerAliveInterval=5"); + args.push("-o", "ServerAliveCountMax=2"); + + args.push(config.host, "echo ok"); + + log.debug(`SSH probe: ssh ${args.join(" ")}`); + + return new Promise((resolve, reject) => { + const proc = spawn("ssh", args, { stdio: ["ignore", "pipe", "pipe"] }); + + let stderr = ""; + proc.stderr.on("data", (data: Buffer) => { + stderr += data.toString(); + }); + + const timeout = setTimeout(() => { + proc.kill("SIGKILL"); + const error = "SSH probe timed out"; + this.markFailedByKey(key, error); + reject(new Error(error)); + }, timeoutMs); + + proc.on("close", (code) => { + clearTimeout(timeout); + + if (code === 0) { + this.markHealthyByKey(key); + log.debug(`SSH probe to ${config.host} succeeded`); + resolve(); + } else { + const error = stderr.trim() || `SSH probe failed with code ${code ?? "unknown"}`; + this.markFailedByKey(key, error); + reject(new Error(error)); + } + }); + + proc.on("error", (err) => { + clearTimeout(timeout); + const error = `SSH probe spawn error: ${err.message}`; + this.markFailedByKey(key, error); + reject(new Error(error)); + }); + }); + } +} + +/** + * Singleton instance for application-wide use + */ +export const sshConnectionPool = new SSHConnectionPool(); /** * Get deterministic controlPath for SSH config. diff --git a/src/node/services/ptyService.ts b/src/node/services/ptyService.ts index 299ce02c3..3d50edb4f 100644 --- a/src/node/services/ptyService.ts +++ b/src/node/services/ptyService.ts @@ -17,7 +17,7 @@ import { SSHRuntime, type SSHRuntimeConfig } from "@/node/runtime/SSHRuntime"; import { LocalBaseRuntime } from "@/node/runtime/LocalBaseRuntime"; import { access } from "fs/promises"; import { constants } from "fs"; -import { getControlPath } from "@/node/runtime/sshConnectionPool"; +import { getControlPath, sshConnectionPool } from "@/node/runtime/sshConnectionPool"; import { expandTildeForSSH } from "@/node/runtime/tildeExpansion"; interface SessionData { @@ -213,6 +213,11 @@ export class PTYService { } else if (runtime instanceof SSHRuntime) { // SSH: Use node-pty to spawn SSH with local PTY (enables resize support) const sshConfig = runtime.getConfig(); + + // Ensure connection is healthy before spawning terminal + // This provides backoff protection and singleflighting for concurrent requests + await sshConnectionPool.acquireConnection(sshConfig); + const sshArgs = buildSSHArgs(sshConfig, workspacePath); log.info(`[PTY] SSH terminal for ${sessionId}: ssh ${sshArgs.join(" ")}`); diff --git a/src/node/services/tools/bash.test.ts b/src/node/services/tools/bash.test.ts index 4d4f4c9dd..66a583c02 100644 --- a/src/node/services/tools/bash.test.ts +++ b/src/node/services/tools/bash.test.ts @@ -6,6 +6,7 @@ import { BASH_MAX_TOTAL_BYTES } from "@/common/constants/toolLimits"; import * as fs from "fs"; import { TestTempDir, createTestToolConfig, getTestDeps } from "./testHelpers"; import { createRuntime } from "@/node/runtime/runtimeFactory"; +import { sshConnectionPool } from "@/node/runtime/sshConnectionPool"; import type { ToolCallOptions } from "ai"; // Mock ToolCallOptions for testing @@ -1292,11 +1293,15 @@ describe("SSH runtime redundant cd detection", () => { // Note: These tests check redundant cd detection logic only - they don't actually execute via SSH function createTestBashToolWithSSH(cwd: string) { const tempDir = new TestTempDir("test-bash-ssh"); - const sshRuntime = createRuntime({ - type: "ssh", + const sshConfig = { + type: "ssh" as const, host: "test-host", srcBaseDir: "/remote/base", - }); + }; + const sshRuntime = createRuntime(sshConfig); + + // Pre-mark connection as healthy to skip actual SSH probe in tests + sshConnectionPool.markHealthy(sshConfig); const tool = createBashTool({ ...getTestDeps(), From 1adf519e806a787e7ab72f4090b0f6cf9a09382b Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Mon, 8 Dec 2025 04:13:07 +0000 Subject: [PATCH 2/3] fix: SSH connection pool concurrency and behavioral issues - Remove srcBaseDir from connection key: workspaces on same host now share health tracking and control socket multiplexing - Fix double markFailedByKey on timeout: add timedOut flag to prevent both timeout callback and on('close') from incrementing failures - Add HEALTHY_TTL_MS (5 min): stale healthy connections get re-probed when network may have silently degraded - Fix singleflighting test: actually test concurrent probes share one failure count instead of pre-marking healthy --- src/node/runtime/sshConnectionPool.test.ts | 28 ++++++++++++---------- src/node/runtime/sshConnectionPool.ts | 25 +++++++++++++++---- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/node/runtime/sshConnectionPool.test.ts b/src/node/runtime/sshConnectionPool.test.ts index c74fd06c2..5e303c641 100644 --- a/src/node/runtime/sshConnectionPool.test.ts +++ b/src/node/runtime/sshConnectionPool.test.ts @@ -58,7 +58,9 @@ describe("sshConnectionPool", () => { expect(getControlPath(config1)).not.toBe(getControlPath(config2)); }); - test("different srcBaseDirs produce different controlPaths", () => { + test("different srcBaseDirs produce same controlPaths (connection shared)", () => { + // srcBaseDir is intentionally excluded from connection key - + // workspaces on the same host share health tracking and multiplexing const config1: SSHRuntimeConfig = { host: "test.com", srcBaseDir: "/work1", @@ -68,7 +70,7 @@ describe("sshConnectionPool", () => { srcBaseDir: "/work2", }; - expect(getControlPath(config1)).not.toBe(getControlPath(config2)); + expect(getControlPath(config1)).toBe(getControlPath(config2)); }); test("controlPath is in tmpdir with expected format", () => { @@ -262,22 +264,22 @@ describe("SSHConnectionPool", () => { test("concurrent acquireConnection calls share same probe", async () => { const pool = new SSHConnectionPool(); const config: SSHRuntimeConfig = { - host: "test.example.com", + host: "nonexistent.invalid.host.test", srcBaseDir: "/work", }; - // Mark healthy to avoid actual probe - pool.markHealthy(config); - - // Multiple concurrent calls should all succeed - const results = await Promise.all([ - pool.acquireConnection(config), - pool.acquireConnection(config), - pool.acquireConnection(config), + // All concurrent calls should share the same probe and get same result + const results = await Promise.allSettled([ + pool.acquireConnection(config, 1000), + pool.acquireConnection(config, 1000), + pool.acquireConnection(config, 1000), ]); - // All should resolve (no errors) - expect(results).toHaveLength(3); + // All should be rejected (connection fails) + expect(results.every((r) => r.status === "rejected")).toBe(true); + + // Only 1 failure should be recorded (not 3) - proves singleflighting worked + expect(pool.getConnectionHealth(config)?.consecutiveFailures).toBe(1); }); }); }); diff --git a/src/node/runtime/sshConnectionPool.ts b/src/node/runtime/sshConnectionPool.ts index 4a45af7df..942fec5bc 100644 --- a/src/node/runtime/sshConnectionPool.ts +++ b/src/node/runtime/sshConnectionPool.ts @@ -52,6 +52,12 @@ export interface ConnectionHealth { */ const BACKOFF_SCHEDULE = [1, 5, 10, 20, 40, 60]; +/** + * Time after which a "healthy" connection should be re-probed. + * Prevents stale health state when network silently degrades. + */ +const HEALTHY_TTL_MS = 5 * 60 * 1000; // 5 minutes + /** * SSH Connection Pool * @@ -85,10 +91,16 @@ export class SSHConnectionPool { ); } - // Return immediately if known healthy + // Return immediately if known healthy and not stale if (health?.status === "healthy") { - log.debug(`SSH connection to ${config.host} is known healthy, skipping probe`); - return; + const age = Date.now() - (health.lastSuccess?.getTime() ?? 0); + if (age < HEALTHY_TTL_MS) { + log.debug(`SSH connection to ${config.host} is known healthy, skipping probe`); + return; + } + log.debug( + `SSH connection to ${config.host} health is stale (${Math.round(age / 1000)}s), re-probing` + ); } // Check for inflight probe - singleflighting @@ -237,7 +249,9 @@ export class SSHConnectionPool { stderr += data.toString(); }); + let timedOut = false; const timeout = setTimeout(() => { + timedOut = true; proc.kill("SIGKILL"); const error = "SSH probe timed out"; this.markFailedByKey(key, error); @@ -246,6 +260,7 @@ export class SSHConnectionPool { proc.on("close", (code) => { clearTimeout(timeout); + if (timedOut) return; // Already handled by timeout if (code === 0) { this.markHealthyByKey(key); @@ -298,11 +313,13 @@ export function getControlPath(config: SSHRuntimeConfig): string { * Includes local username to prevent cross-user socket collisions. */ function makeConnectionKey(config: SSHRuntimeConfig): string { + // Note: srcBaseDir is intentionally excluded - connection identity is determined + // by user + host + port + key. This allows health tracking and multiplexing + // to be shared across workspaces on the same host. const parts = [ os.userInfo().username, // Include local user to prevent cross-user collisions config.host, config.port?.toString() ?? "22", - config.srcBaseDir, config.identityFile ?? "default", ]; return parts.join(":"); From 32fdb7909c6edf3c5ce47735ea2a095751aeb963 Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Mon, 8 Dec 2025 04:14:53 +0000 Subject: [PATCH 3/3] chore: reduce healthy TTL to 15s --- src/node/runtime/sshConnectionPool.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node/runtime/sshConnectionPool.ts b/src/node/runtime/sshConnectionPool.ts index 942fec5bc..c6360e90b 100644 --- a/src/node/runtime/sshConnectionPool.ts +++ b/src/node/runtime/sshConnectionPool.ts @@ -56,7 +56,7 @@ const BACKOFF_SCHEDULE = [1, 5, 10, 20, 40, 60]; * Time after which a "healthy" connection should be re-probed. * Prevents stale health state when network silently degrades. */ -const HEALTHY_TTL_MS = 5 * 60 * 1000; // 5 minutes +const HEALTHY_TTL_MS = 15 * 1000; // 15 seconds /** * SSH Connection Pool