From 8db6fcca777ac751597b1290a201d0df6161f9f2 Mon Sep 17 00:00:00 2001 From: Sally O'Malley Date: Sat, 14 Mar 2026 14:27:52 -0400 Subject: [PATCH] fix(gateway/cli): relax local backend self-pairing and harden launchd restarts (#46290) Signed-off-by: sallyom --- src/cli/daemon-cli/restart-health.ts | 2 +- src/cli/daemon-cli/status.gather.test.ts | 44 +++++++ src/cli/daemon-cli/status.gather.ts | 21 ++++ src/cli/daemon-cli/status.print.test.ts | 116 ++++++++++++++++++ src/cli/daemon-cli/status.print.ts | 19 +++ src/daemon/launchd.test.ts | 41 ++++++- src/daemon/launchd.ts | 44 +++++++ .../server.auth.compat-baseline.test.ts | 47 +++++++ .../handshake-auth-helpers.test.ts | 11 +- .../ws-connection/handshake-auth-helpers.ts | 7 +- 10 files changed, 347 insertions(+), 5 deletions(-) create mode 100644 src/cli/daemon-cli/status.print.test.ts diff --git a/src/cli/daemon-cli/restart-health.ts b/src/cli/daemon-cli/restart-health.ts index 9bfe3476ee6..43102cedee8 100644 --- a/src/cli/daemon-cli/restart-health.ts +++ b/src/cli/daemon-cli/restart-health.ts @@ -182,7 +182,7 @@ export async function inspectGatewayRestart(params: { return true; } if (runtimePid == null) { - return true; + return false; } return !listenerOwnedByRuntimePid({ listener, runtimePid }); }) diff --git a/src/cli/daemon-cli/status.gather.test.ts b/src/cli/daemon-cli/status.gather.test.ts index b0c08715abe..27b53753eda 100644 --- a/src/cli/daemon-cli/status.gather.test.ts +++ b/src/cli/daemon-cli/status.gather.test.ts @@ -1,5 +1,6 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { captureEnv } from "../../test-utils/env.js"; +import type { GatewayRestartSnapshot } from "./restart-health.js"; const callGatewayStatusProbe = vi.fn(async (_opts?: unknown) => ({ ok: true as const })); const loadGatewayTlsRuntime = vi.fn(async (_cfg?: unknown) => ({ @@ -18,6 +19,14 @@ const readLastGatewayErrorLine = vi.fn(async (_env?: NodeJS.ProcessEnv) => null) const auditGatewayServiceConfig = vi.fn(async (_opts?: unknown) => undefined); const serviceIsLoaded = vi.fn(async (_opts?: unknown) => true); const serviceReadRuntime = vi.fn(async (_env?: NodeJS.ProcessEnv) => ({ status: "running" })); +const inspectGatewayRestart = vi.fn<(opts?: unknown) => Promise>( + async (_opts?: unknown) => ({ + runtime: { status: "running", pid: 1234 }, + portUsage: { port: 19001, status: "busy", listeners: [], hints: [] }, + healthy: true, + staleGatewayPids: [], + }), +); const serviceReadCommand = vi.fn< (env?: NodeJS.ProcessEnv) => Promise<{ programArguments: string[]; @@ -117,6 +126,10 @@ vi.mock("./probe.js", () => ({ probeGatewayStatus: (opts: unknown) => callGatewayStatusProbe(opts), })); +vi.mock("./restart-health.js", () => ({ + inspectGatewayRestart: (opts: unknown) => inspectGatewayRestart(opts), +})); + const { gatherDaemonStatus } = await import("./status.gather.js"); describe("gatherDaemonStatus", () => { @@ -139,6 +152,7 @@ describe("gatherDaemonStatus", () => { delete process.env.DAEMON_GATEWAY_PASSWORD; callGatewayStatusProbe.mockClear(); loadGatewayTlsRuntime.mockClear(); + inspectGatewayRestart.mockClear(); daemonLoadedConfig = { gateway: { bind: "lan", @@ -362,4 +376,34 @@ describe("gatherDaemonStatus", () => { expect(callGatewayStatusProbe).not.toHaveBeenCalled(); expect(status.rpc).toBeUndefined(); }); + + it("surfaces stale gateway listener pids from restart health inspection", async () => { + inspectGatewayRestart.mockResolvedValueOnce({ + runtime: { status: "running", pid: 8000 }, + portUsage: { + port: 19001, + status: "busy", + listeners: [{ pid: 9000, ppid: 8999, commandLine: "openclaw-gateway" }], + hints: [], + }, + healthy: false, + staleGatewayPids: [9000], + }); + + const status = await gatherDaemonStatus({ + rpc: {}, + probe: true, + deep: false, + }); + + expect(inspectGatewayRestart).toHaveBeenCalledWith( + expect.objectContaining({ + port: 19001, + }), + ); + expect(status.health).toEqual({ + healthy: false, + staleGatewayPids: [9000], + }); + }); }); diff --git a/src/cli/daemon-cli/status.gather.ts b/src/cli/daemon-cli/status.gather.ts index ef15a377438..707a908b1f6 100644 --- a/src/cli/daemon-cli/status.gather.ts +++ b/src/cli/daemon-cli/status.gather.ts @@ -29,6 +29,7 @@ import { import { pickPrimaryTailnetIPv4 } from "../../infra/tailnet.js"; import { loadGatewayTlsRuntime } from "../../infra/tls/gateway.js"; import { probeGatewayStatus } from "./probe.js"; +import { inspectGatewayRestart } from "./restart-health.js"; import { normalizeListenerAddress, parsePortFromArgs, pickProbeHostForBind } from "./shared.js"; import type { GatewayRpcOpts } from "./types.js"; @@ -112,6 +113,10 @@ export type DaemonStatus = { error?: string; url?: string; }; + health?: { + healthy: boolean; + staleGatewayPids: number[]; + }; extraServices: Array<{ label: string; detail: string; scope: string }>; }; @@ -331,6 +336,14 @@ export async function gatherDaemonStatus( configPath: daemonConfigSummary.path, }) : undefined; + const health = + opts.probe && loaded + ? await inspectGatewayRestart({ + service, + port: daemonPort, + env: serviceEnv, + }).catch(() => undefined) + : undefined; let lastError: string | undefined; if (loaded && runtime?.status === "running" && portStatus && portStatus.status !== "busy") { @@ -357,6 +370,14 @@ export async function gatherDaemonStatus( ...(portCliStatus ? { portCli: portCliStatus } : {}), lastError, ...(rpc ? { rpc: { ...rpc, url: gateway.probeUrl } } : {}), + ...(health + ? { + health: { + healthy: health.healthy, + staleGatewayPids: health.staleGatewayPids, + }, + } + : {}), extraServices, }; } diff --git a/src/cli/daemon-cli/status.print.test.ts b/src/cli/daemon-cli/status.print.test.ts new file mode 100644 index 00000000000..e99fa84de37 --- /dev/null +++ b/src/cli/daemon-cli/status.print.test.ts @@ -0,0 +1,116 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const runtime = vi.hoisted(() => ({ + log: vi.fn<(line: string) => void>(), + error: vi.fn<(line: string) => void>(), +})); + +vi.mock("../../runtime.js", () => ({ + defaultRuntime: runtime, +})); + +vi.mock("../../terminal/theme.js", () => ({ + colorize: (_rich: boolean, _theme: unknown, text: string) => text, +})); + +vi.mock("../../commands/onboard-helpers.js", () => ({ + resolveControlUiLinks: () => ({ httpUrl: "http://127.0.0.1:18789" }), +})); + +vi.mock("../../daemon/inspect.js", () => ({ + renderGatewayServiceCleanupHints: () => [], +})); + +vi.mock("../../daemon/launchd.js", () => ({ + resolveGatewayLogPaths: () => ({ + stdoutPath: "/tmp/gateway.out.log", + stderrPath: "/tmp/gateway.err.log", + }), +})); + +vi.mock("../../daemon/systemd-hints.js", () => ({ + isSystemdUnavailableDetail: () => false, + renderSystemdUnavailableHints: () => [], +})); + +vi.mock("../../infra/wsl.js", () => ({ + isWSLEnv: () => false, +})); + +vi.mock("../../logging.js", () => ({ + getResolvedLoggerSettings: () => ({ file: "/tmp/openclaw.log" }), +})); + +vi.mock("./shared.js", () => ({ + createCliStatusTextStyles: () => ({ + rich: false, + label: (text: string) => text, + accent: (text: string) => text, + infoText: (text: string) => text, + okText: (text: string) => text, + warnText: (text: string) => text, + errorText: (text: string) => text, + }), + filterDaemonEnv: () => ({}), + formatRuntimeStatus: () => "running (pid 8000)", + resolveRuntimeStatusColor: () => "", + renderRuntimeHints: () => [], + safeDaemonEnv: () => [], +})); + +vi.mock("./status.gather.js", () => ({ + renderPortDiagnosticsForCli: () => [], + resolvePortListeningAddresses: () => ["127.0.0.1:18789"], +})); + +const { printDaemonStatus } = await import("./status.print.js"); + +describe("printDaemonStatus", () => { + beforeEach(() => { + runtime.log.mockReset(); + runtime.error.mockReset(); + }); + + it("prints stale gateway pid guidance when runtime does not own the listener", () => { + printDaemonStatus( + { + service: { + label: "LaunchAgent", + loaded: true, + loadedText: "loaded", + notLoadedText: "not loaded", + runtime: { status: "running", pid: 8000 }, + }, + gateway: { + bindMode: "loopback", + bindHost: "127.0.0.1", + port: 18789, + portSource: "env/config", + probeUrl: "ws://127.0.0.1:18789", + }, + port: { + port: 18789, + status: "busy", + listeners: [{ pid: 9000, ppid: 8999, address: "127.0.0.1:18789" }], + hints: [], + }, + rpc: { + ok: false, + error: "gateway closed (1006 abnormal closure (no close frame))", + url: "ws://127.0.0.1:18789", + }, + health: { + healthy: false, + staleGatewayPids: [9000], + }, + extraServices: [], + }, + { json: false }, + ); + + expect(runtime.error).toHaveBeenCalledWith( + expect.stringContaining("Gateway runtime PID does not own the listening port"), + ); + expect(runtime.error).toHaveBeenCalledWith(expect.stringContaining("openclaw gateway restart")); + }); +}); diff --git a/src/cli/daemon-cli/status.print.ts b/src/cli/daemon-cli/status.print.ts index ce9934f7ed4..91348d10d4a 100644 --- a/src/cli/daemon-cli/status.print.ts +++ b/src/cli/daemon-cli/status.print.ts @@ -194,6 +194,25 @@ export function printDaemonStatus(status: DaemonStatus, opts: { json: boolean }) spacer(); } + if ( + status.health && + status.health.staleGatewayPids.length > 0 && + service.runtime?.status === "running" && + typeof service.runtime.pid === "number" + ) { + defaultRuntime.error( + errorText( + `Gateway runtime PID does not own the listening port. Other gateway process(es) are listening: ${status.health.staleGatewayPids.join(", ")}`, + ), + ); + defaultRuntime.error( + errorText( + `Fix: run ${formatCliCommand("openclaw gateway restart")} and re-check with ${formatCliCommand("openclaw gateway status --deep")}.`, + ), + ); + spacer(); + } + const systemdUnavailable = process.platform === "linux" && isSystemdUnavailableDetail(service.runtime?.detail); if (systemdUnavailable) { diff --git a/src/daemon/launchd.test.ts b/src/daemon/launchd.test.ts index 4c624cfeec1..341f071de91 100644 --- a/src/daemon/launchd.test.ts +++ b/src/daemon/launchd.test.ts @@ -29,6 +29,9 @@ const launchdRestartHandoffState = vi.hoisted(() => ({ isCurrentProcessLaunchdServiceLabel: vi.fn<(label: string) => boolean>(() => false), scheduleDetachedLaunchdRestartHandoff: vi.fn((_params: unknown) => ({ ok: true, pid: 7331 })), })); +const cleanStaleGatewayProcessesSync = vi.hoisted(() => + vi.fn<(port?: number) => number[]>(() => []), +); const defaultProgramArguments = ["node", "-e", "process.exit(0)"]; function expectLaunchctlEnableBootstrapOrder(env: Record) { @@ -89,6 +92,10 @@ vi.mock("./launchd-restart-handoff.js", () => ({ launchdRestartHandoffState.scheduleDetachedLaunchdRestartHandoff(params), })); +vi.mock("../infra/restart-stale-pids.js", () => ({ + cleanStaleGatewayProcessesSync: (port?: number) => cleanStaleGatewayProcessesSync(port), +})); + vi.mock("node:fs/promises", async (importOriginal) => { const actual = await importOriginal(); const wrapped = { @@ -151,6 +158,8 @@ beforeEach(() => { state.dirModes.clear(); state.files.clear(); state.fileModes.clear(); + cleanStaleGatewayProcessesSync.mockReset(); + cleanStaleGatewayProcessesSync.mockReturnValue([]); launchdRestartHandoffState.isCurrentProcessLaunchdServiceLabel.mockReset(); launchdRestartHandoffState.isCurrentProcessLaunchdServiceLabel.mockReturnValue(false); launchdRestartHandoffState.scheduleDetachedLaunchdRestartHandoff.mockReset(); @@ -328,7 +337,10 @@ describe("launchd install", () => { }); it("restarts LaunchAgent with kickstart and no bootout", async () => { - const env = createDefaultLaunchdEnv(); + const env = { + ...createDefaultLaunchdEnv(), + OPENCLAW_GATEWAY_PORT: "18789", + }; const result = await restartLaunchAgent({ env, stdout: new PassThrough(), @@ -338,11 +350,38 @@ describe("launchd install", () => { const label = "ai.openclaw.gateway"; const serviceId = `${domain}/${label}`; expect(result).toEqual({ outcome: "completed" }); + expect(cleanStaleGatewayProcessesSync).toHaveBeenCalledWith(18789); expect(state.launchctlCalls).toContainEqual(["kickstart", "-k", serviceId]); expect(state.launchctlCalls.some((call) => call[0] === "bootout")).toBe(false); expect(state.launchctlCalls.some((call) => call[0] === "bootstrap")).toBe(false); }); + it("uses the configured gateway port for stale cleanup", async () => { + const env = { + ...createDefaultLaunchdEnv(), + OPENCLAW_GATEWAY_PORT: "19001", + }; + + await restartLaunchAgent({ + env, + stdout: new PassThrough(), + }); + + expect(cleanStaleGatewayProcessesSync).toHaveBeenCalledWith(19001); + }); + + it("skips stale cleanup when no explicit launch agent port can be resolved", async () => { + const env = createDefaultLaunchdEnv(); + state.files.clear(); + + await restartLaunchAgent({ + env, + stdout: new PassThrough(), + }); + + expect(cleanStaleGatewayProcessesSync).not.toHaveBeenCalled(); + }); + it("falls back to bootstrap when kickstart cannot find the service", async () => { const env = createDefaultLaunchdEnv(); state.kickstartError = "Could not find service"; diff --git a/src/daemon/launchd.ts b/src/daemon/launchd.ts index 29d0933558c..6c190ccd213 100644 --- a/src/daemon/launchd.ts +++ b/src/daemon/launchd.ts @@ -1,6 +1,7 @@ import fs from "node:fs/promises"; import path from "node:path"; import { parseStrictInteger, parseStrictPositiveInteger } from "../infra/parse-finite-number.js"; +import { cleanStaleGatewayProcessesSync } from "../infra/restart-stale-pids.js"; import { GATEWAY_LAUNCH_AGENT_LABEL, resolveGatewayServiceDescription, @@ -113,6 +114,44 @@ async function execLaunchctl( return await execFileUtf8(file, fileArgs, isWindows ? { windowsHide: true } : {}); } +function parseGatewayPortFromProgramArguments( + programArguments: string[] | undefined, +): number | null { + if (!Array.isArray(programArguments) || programArguments.length === 0) { + return null; + } + for (let index = 0; index < programArguments.length; index += 1) { + const current = programArguments[index]?.trim(); + if (!current) { + continue; + } + if (current === "--port") { + const next = parseStrictPositiveInteger(programArguments[index + 1] ?? ""); + if (next !== undefined) { + return next; + } + continue; + } + if (current.startsWith("--port=")) { + const value = parseStrictPositiveInteger(current.slice("--port=".length)); + if (value !== undefined) { + return value; + } + } + } + return null; +} + +async function resolveLaunchAgentGatewayPort(env: GatewayServiceEnv): Promise { + const command = await readLaunchAgentProgramArguments(env).catch(() => null); + const fromArgs = parseGatewayPortFromProgramArguments(command?.programArguments); + if (fromArgs !== null) { + return fromArgs; + } + const fromEnv = parseStrictPositiveInteger(env.OPENCLAW_GATEWAY_PORT ?? ""); + return fromEnv ?? null; +} + function resolveGuiDomain(): string { if (typeof process.getuid !== "function") { return "gui/501"; @@ -514,6 +553,11 @@ export async function restartLaunchAgent({ return { outcome: "scheduled" }; } + const cleanupPort = await resolveLaunchAgentGatewayPort(serviceEnv); + if (cleanupPort !== null) { + cleanStaleGatewayProcessesSync(cleanupPort); + } + const start = await execLaunchctl(["kickstart", "-k", serviceTarget]); if (start.code === 0) { writeLaunchAgentActionLine(stdout, "Restarted LaunchAgent", serviceTarget); diff --git a/src/gateway/server.auth.compat-baseline.test.ts b/src/gateway/server.auth.compat-baseline.test.ts index a606feab909..27fc4abc72d 100644 --- a/src/gateway/server.auth.compat-baseline.test.ts +++ b/src/gateway/server.auth.compat-baseline.test.ts @@ -1,5 +1,8 @@ +import os from "node:os"; +import path from "node:path"; import { afterAll, beforeAll, describe, expect, test } from "vitest"; import { + BACKEND_GATEWAY_CLIENT, connectReq, CONTROL_UI_CLIENT, ConnectErrorDetailCodes, @@ -144,6 +147,50 @@ describe("gateway auth compatibility baseline", () => { ws.close(); } }); + + test("keeps local backend device-token reconnects out of pairing", async () => { + const identityPath = path.join( + os.tmpdir(), + `openclaw-backend-device-${process.pid}-${port}.json`, + ); + const { loadOrCreateDeviceIdentity, publicKeyRawBase64UrlFromPem } = + await import("../infra/device-identity.js"); + const { approveDevicePairing, requestDevicePairing, rotateDeviceToken } = + await import("../infra/device-pairing.js"); + + const identity = loadOrCreateDeviceIdentity(identityPath); + const pending = await requestDevicePairing({ + deviceId: identity.deviceId, + publicKey: publicKeyRawBase64UrlFromPem(identity.publicKeyPem), + clientId: BACKEND_GATEWAY_CLIENT.id, + clientMode: BACKEND_GATEWAY_CLIENT.mode, + role: "operator", + scopes: ["operator.admin"], + }); + await approveDevicePairing(pending.request.requestId); + + const rotated = await rotateDeviceToken({ + deviceId: identity.deviceId, + role: "operator", + scopes: ["operator.admin"], + }); + expect(rotated?.token).toBeTruthy(); + + const ws = await openWs(port); + try { + const res = await connectReq(ws, { + skipDefaultAuth: true, + client: { ...BACKEND_GATEWAY_CLIENT }, + deviceIdentityPath: identityPath, + deviceToken: String(rotated?.token ?? ""), + scopes: ["operator.admin"], + }); + expect(res.ok).toBe(true); + expect((res.payload as { type?: string } | undefined)?.type).toBe("hello-ok"); + } finally { + ws.close(); + } + }); }); describe("password mode", () => { diff --git a/src/gateway/server/ws-connection/handshake-auth-helpers.test.ts b/src/gateway/server/ws-connection/handshake-auth-helpers.test.ts index 8b7b7e521fd..cc064e35631 100644 --- a/src/gateway/server/ws-connection/handshake-auth-helpers.test.ts +++ b/src/gateway/server/ws-connection/handshake-auth-helpers.test.ts @@ -89,7 +89,7 @@ describe("handshake auth helpers", () => { ).toBe(false); }); - it("skips backend self-pairing only for local shared-secret backend clients", () => { + it("skips backend self-pairing for local trusted backend clients", () => { const connectParams = { client: { id: GATEWAY_CLIENT_IDS.GATEWAY_CLIENT, @@ -106,6 +106,15 @@ describe("handshake auth helpers", () => { authMethod: "token", }), ).toBe(true); + expect( + shouldSkipBackendSelfPairing({ + connectParams, + isLocalClient: true, + hasBrowserOriginHeader: false, + sharedAuthOk: false, + authMethod: "device-token", + }), + ).toBe(true); expect( shouldSkipBackendSelfPairing({ connectParams, diff --git a/src/gateway/server/ws-connection/handshake-auth-helpers.ts b/src/gateway/server/ws-connection/handshake-auth-helpers.ts index 8529cf55082..20dba4ca2a0 100644 --- a/src/gateway/server/ws-connection/handshake-auth-helpers.ts +++ b/src/gateway/server/ws-connection/handshake-auth-helpers.ts @@ -74,11 +74,14 @@ export function shouldSkipBackendSelfPairing(params: { return false; } const usesSharedSecretAuth = params.authMethod === "token" || params.authMethod === "password"; + const usesDeviceTokenAuth = params.authMethod === "device-token"; + // `authMethod === "device-token"` only reaches this helper after the caller + // has already accepted auth (`authOk === true`), so a separate + // `deviceTokenAuthOk` flag would be redundant here. return ( params.isLocalClient && !params.hasBrowserOriginHeader && - params.sharedAuthOk && - usesSharedSecretAuth + ((params.sharedAuthOk && usesSharedSecretAuth) || usesDeviceTokenAuth) ); }