From 6e94b047e2da038dbaaed99e28207948a7749972 Mon Sep 17 00:00:00 2001 From: Brad Groux <3053586+BradGroux@users.noreply.github.com> Date: Fri, 3 Apr 2026 00:21:14 -0500 Subject: [PATCH] fix: improve WS handshake reliability on slow-startup environments (#60075) * fix: import CHANNEL_IDS from leaf module to avoid TDZ on init (#48832) schema.ts and validation.ts imported CHANNEL_IDS from channels/registry.js, which re-exports from channels/ids.js but also imports plugins/runtime.js. When the bundler resolves this dependency graph, the re-exported CHANNEL_IDS can be undefined at the point config/validation.ts evaluates (temporal dead zone), causing 'CHANNEL_IDS is not iterable' on startup. Fix: import CHANNEL_IDS directly from channels/ids.js (the leaf module with zero heavy dependencies) and normalizeChatChannelId from channels/chat-meta.js. Fixes #48832 * fix: improve WS handshake reliability on slow-startup environments (#48736) On Windows with large dist bundles (46MB/639 files), heavy synchronous module loading blocks the event loop during CLI startup, preventing timely processing of the connect.challenge frame and causing ~80% handshake timeout failures. 
Changes: - Yield event loop (setImmediate) before starting WS connection in callGateway to let pending I/O drain after heavy module loading - Add OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS env var override for client-side connect challenge timeout (server already has OPENCLAW_HANDSHAKE_TIMEOUT_MS) - Include diagnostic timing in challenge timeout error messages (elapsed vs limit) for easier debugging - Add tests for env var override and resolution logic --------- Co-authored-by: Brad Groux --- src/gateway/call.ts | 5 +++++ src/gateway/client.ts | 8 +++++++- src/gateway/handshake-timeouts.test.ts | 27 ++++++++++++++++++++++++++ src/gateway/handshake-timeouts.ts | 24 ++++++++++++++++++++--- 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/gateway/call.ts b/src/gateway/call.ts index 001410a739e..582810c24d0 100644 --- a/src/gateway/call.ts +++ b/src/gateway/call.ts @@ -800,6 +800,11 @@ async function executeGatewayRequestWithScopes<T>(params: { }): Promise<T> { const { opts, scopes, url, token, password, tlsFingerprint, timeoutMs, safeTimerTimeoutMs } = params; + // Yield to the event loop before starting the WebSocket connection. + // On Windows with large dist bundles, heavy synchronous module loading + // can starve the event loop, preventing timely processing of the + // connect.challenge frame and causing handshake timeouts (#48736). 
+ await new Promise((r) => setImmediate(r)); return await new Promise((resolve, reject) => { let settled = false; let ignoreClose = false; diff --git a/src/gateway/client.ts b/src/gateway/client.ts index 7a22cae41b6..eeedc4e562a 100644 --- a/src/gateway/client.ts +++ b/src/gateway/client.ts @@ -727,12 +727,18 @@ export class GatewayClient { private armConnectChallengeTimeout() { const connectChallengeTimeoutMs = resolveGatewayClientConnectChallengeTimeoutMs(this.opts); + const armedAt = Date.now(); this.clearConnectChallengeTimeout(); this.connectTimer = setTimeout(() => { if (this.connectSent || this.ws?.readyState !== WebSocket.OPEN) { return; } - this.opts.onConnectError?.(new Error("gateway connect challenge timeout")); + const elapsedMs = Date.now() - armedAt; + this.opts.onConnectError?.( + new Error( + `gateway connect challenge timeout (waited ${elapsedMs}ms, limit ${connectChallengeTimeoutMs}ms)`, + ), + ); this.ws?.close(1008, "connect challenge timeout"); }, connectChallengeTimeoutMs); } diff --git a/src/gateway/handshake-timeouts.test.ts b/src/gateway/handshake-timeouts.test.ts index 13fe14ea746..9feccdab8ef 100644 --- a/src/gateway/handshake-timeouts.test.ts +++ b/src/gateway/handshake-timeouts.test.ts @@ -2,6 +2,7 @@ import { describe, expect, test } from "vitest"; import { clampConnectChallengeTimeoutMs, DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS, + getConnectChallengeTimeoutMsFromEnv, getPreauthHandshakeTimeoutMsFromEnv, MAX_CONNECT_CHALLENGE_TIMEOUT_MS, MIN_CONNECT_CHALLENGE_TIMEOUT_MS, @@ -34,4 +35,30 @@ describe("gateway handshake timeouts", () => { }), ).toBe(20); }); + + test("getConnectChallengeTimeoutMsFromEnv reads OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS", () => { + expect(getConnectChallengeTimeoutMsFromEnv({})).toBeUndefined(); + expect( + getConnectChallengeTimeoutMsFromEnv({ OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS: "15000" }), + ).toBe(15_000); + expect( + getConnectChallengeTimeoutMsFromEnv({ OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS: "garbage" }), + 
).toBeUndefined(); + }); + + test("resolveConnectChallengeTimeoutMs falls back to env override", () => { + const original = process.env.OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS; + try { + process.env.OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS = "5000"; + expect(resolveConnectChallengeTimeoutMs()).toBe(5_000); + // Explicit value still takes precedence over env + expect(resolveConnectChallengeTimeoutMs(3_000)).toBe(3_000); + } finally { + if (original === undefined) { + delete process.env.OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS; + } else { + process.env.OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS = original; + } + } + }); }); diff --git a/src/gateway/handshake-timeouts.ts b/src/gateway/handshake-timeouts.ts index 1911db22658..545d3e72d65 100644 --- a/src/gateway/handshake-timeouts.ts +++ b/src/gateway/handshake-timeouts.ts @@ -9,10 +9,28 @@ export function clampConnectChallengeTimeoutMs(timeoutMs: number): number { ); } +export function getConnectChallengeTimeoutMsFromEnv( + env: NodeJS.ProcessEnv = process.env, +): number | undefined { + const raw = env.OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS; + if (raw) { + const parsed = Number(raw); + if (Number.isFinite(parsed) && parsed > 0) { + return parsed; + } + } + return undefined; +} + export function resolveConnectChallengeTimeoutMs(timeoutMs?: number | null): number { - return typeof timeoutMs === "number" && Number.isFinite(timeoutMs) - ? clampConnectChallengeTimeoutMs(timeoutMs) - : DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS; + if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) { + return clampConnectChallengeTimeoutMs(timeoutMs); + } + const envOverride = getConnectChallengeTimeoutMsFromEnv(); + if (envOverride !== undefined) { + return clampConnectChallengeTimeoutMs(envOverride); + } + return DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS; } export function getPreauthHandshakeTimeoutMsFromEnv(env: NodeJS.ProcessEnv = process.env): number {