fix: improve WS handshake reliability on slow-startup environments (#60075)

* fix: import CHANNEL_IDS from leaf module to avoid TDZ on init (#48832)

schema.ts and validation.ts imported CHANNEL_IDS from channels/registry.js,
which re-exports from channels/ids.js but also imports plugins/runtime.js.
When the bundler resolves this dependency graph, the re-exported CHANNEL_IDS
can be undefined at the point config/validation.ts evaluates (temporal dead
zone), causing 'CHANNEL_IDS is not iterable' on startup.

Fix: import CHANNEL_IDS directly from channels/ids.js (the leaf module with
zero heavy dependencies) and normalizeChatChannelId from channels/chat-meta.js.

Fixes #48832

* fix: improve WS handshake reliability on slow-startup environments (#48736)

On Windows with large dist bundles (46MB/639 files), heavy synchronous
module loading blocks the event loop during CLI startup, preventing
timely processing of the connect.challenge frame and causing ~80%
handshake timeout failures.

Changes:
- Yield to the event loop (setImmediate) before starting the WS connection in
  callGateway to let pending I/O drain after heavy module loading
- Add OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS env var override for
  client-side connect challenge timeout (server already has
  OPENCLAW_HANDSHAKE_TIMEOUT_MS)
- Include diagnostic timing in challenge timeout error messages
  (elapsed vs limit) for easier debugging
- Add tests for env var override and resolution logic

---------

Co-authored-by: Brad Groux <bradgroux@users.noreply.github.com>
This commit is contained in:
Brad Groux 2026-04-03 00:21:14 -05:00 committed by GitHub
parent 0aa98a8e3b
commit 6e94b047e2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 60 additions and 4 deletions

View File

@ -800,6 +800,11 @@ async function executeGatewayRequestWithScopes<T>(params: {
}): Promise<T> {
const { opts, scopes, url, token, password, tlsFingerprint, timeoutMs, safeTimerTimeoutMs } =
params;
// Yield to the event loop before starting the WebSocket connection.
// On Windows with large dist bundles, heavy synchronous module loading
// can starve the event loop, preventing timely processing of the
// connect.challenge frame and causing handshake timeouts (#48736).
await new Promise<void>((r) => setImmediate(r));
return await new Promise<T>((resolve, reject) => {
let settled = false;
let ignoreClose = false;

View File

@ -727,12 +727,18 @@ export class GatewayClient {
/**
 * Arms the timer that bounds how long the client waits for the connect
 * challenge before giving up on the handshake.
 *
 * Any previously armed timer is cleared first. When the timer fires and the
 * connect has not been sent while the socket is still open, the failure is
 * reported exactly once through `opts.onConnectError` (including elapsed time
 * versus the configured limit) and the socket is closed with code 1008.
 */
private armConnectChallengeTimeout() {
  const connectChallengeTimeoutMs = resolveGatewayClientConnectChallengeTimeoutMs(this.opts);
  // Captured up front so the error can report how long the wait really took;
  // a starved event loop can make this much larger than the limit.
  const armedAt = Date.now();
  this.clearConnectChallengeTimeout();
  this.connectTimer = setTimeout(() => {
    // Nothing to report once the connect was sent or the socket is not open.
    if (this.connectSent || this.ws?.readyState !== WebSocket.OPEN) {
      return;
    }
    const elapsedMs = Date.now() - armedAt;
    this.opts.onConnectError?.(
      new Error(
        `gateway connect challenge timeout (waited ${elapsedMs}ms, limit ${connectChallengeTimeoutMs}ms)`,
      ),
    );
    this.ws?.close(1008, "connect challenge timeout");
  }, connectChallengeTimeoutMs);
}

View File

@ -2,6 +2,7 @@ import { describe, expect, test } from "vitest";
import {
clampConnectChallengeTimeoutMs,
DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS,
getConnectChallengeTimeoutMsFromEnv,
getPreauthHandshakeTimeoutMsFromEnv,
MAX_CONNECT_CHALLENGE_TIMEOUT_MS,
MIN_CONNECT_CHALLENGE_TIMEOUT_MS,
@ -34,4 +35,30 @@ describe("gateway handshake timeouts", () => {
}),
).toBe(20);
});
// Covers the three input classes of the env reader: unset, numeric, non-numeric.
test("getConnectChallengeTimeoutMsFromEnv reads OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS", () => {
  // Variable absent: no override is reported.
  const whenUnset = getConnectChallengeTimeoutMsFromEnv({});
  expect(whenUnset).toBeUndefined();

  // Numeric string: parsed into a number of milliseconds.
  const whenNumeric = getConnectChallengeTimeoutMsFromEnv({
    OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS: "15000",
  });
  expect(whenNumeric).toBe(15_000);

  // Non-numeric string: ignored rather than propagated.
  const whenGarbage = getConnectChallengeTimeoutMsFromEnv({
    OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS: "garbage",
  });
  expect(whenGarbage).toBeUndefined();
});
// Verifies precedence: explicit argument first, then the process-level env override.
test("resolveConnectChallengeTimeoutMs falls back to env override", () => {
  const envKey = "OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS";
  // Remember the ambient value so the process environment is left untouched.
  const saved = process.env[envKey];
  try {
    process.env[envKey] = "5000";
    const fromEnv = resolveConnectChallengeTimeoutMs();
    expect(fromEnv).toBe(5_000);
    // Explicit value still takes precedence over env
    const fromArgument = resolveConnectChallengeTimeoutMs(3_000);
    expect(fromArgument).toBe(3_000);
  } finally {
    if (saved !== undefined) {
      process.env[envKey] = saved;
    } else {
      delete process.env[envKey];
    }
  }
});
});

View File

@ -9,10 +9,28 @@ export function clampConnectChallengeTimeoutMs(timeoutMs: number): number {
);
}
/**
 * Reads the connect-challenge timeout override from an environment map.
 *
 * @param env - Environment to read `OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS`
 *   from; defaults to `process.env`.
 * @returns The configured value as a number when it is set, finite and
 *   strictly positive; otherwise `undefined`.
 */
export function getConnectChallengeTimeoutMsFromEnv(
  env: NodeJS.ProcessEnv = process.env,
): number | undefined {
  const rawValue = env.OPENCLAW_CONNECT_CHALLENGE_TIMEOUT_MS;
  // Unset or empty string means no override was requested.
  if (!rawValue) {
    return undefined;
  }
  const timeoutMs = Number(rawValue);
  // Reject NaN, infinities, zero and negatives.
  const isUsable = Number.isFinite(timeoutMs) && timeoutMs > 0;
  return isUsable ? timeoutMs : undefined;
}
/**
 * Resolves the effective connect-challenge timeout in milliseconds.
 *
 * Precedence:
 * 1. An explicit, finite numeric `timeoutMs`, clamped.
 * 2. The override returned by `getConnectChallengeTimeoutMsFromEnv()`, clamped.
 * 3. `DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS`.
 *
 * @param timeoutMs - Caller-supplied timeout; `null`, `undefined`, `NaN` and
 *   infinities are treated as "not provided".
 * @returns The timeout to use for the connect challenge.
 */
export function resolveConnectChallengeTimeoutMs(timeoutMs?: number | null): number {
  if (typeof timeoutMs === "number" && Number.isFinite(timeoutMs)) {
    return clampConnectChallengeTimeoutMs(timeoutMs);
  }
  // Only consult the environment when the caller gave no usable value, so an
  // explicit argument always wins over the env override.
  const envOverride = getConnectChallengeTimeoutMsFromEnv();
  if (envOverride !== undefined) {
    return clampConnectChallengeTimeoutMs(envOverride);
  }
  return DEFAULT_PREAUTH_HANDSHAKE_TIMEOUT_MS;
}
export function getPreauthHandshakeTimeoutMsFromEnv(env: NodeJS.ProcessEnv = process.env): number {