fix(gateway/cli): relax local backend self-pairing and harden launchd restarts (#46290)

Signed-off-by: sallyom <somalley@redhat.com>
This commit is contained in:
Sally O'Malley 2026-03-14 14:27:52 -04:00 committed by GitHub
parent ac29edf6c3
commit 8db6fcca77
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 347 additions and 5 deletions

View File

@ -182,7 +182,7 @@ export async function inspectGatewayRestart(params: {
return true; return true;
} }
if (runtimePid == null) { if (runtimePid == null) {
return true; return false;
} }
return !listenerOwnedByRuntimePid({ listener, runtimePid }); return !listenerOwnedByRuntimePid({ listener, runtimePid });
}) })

View File

@ -1,5 +1,6 @@
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { captureEnv } from "../../test-utils/env.js"; import { captureEnv } from "../../test-utils/env.js";
import type { GatewayRestartSnapshot } from "./restart-health.js";
const callGatewayStatusProbe = vi.fn(async (_opts?: unknown) => ({ ok: true as const })); const callGatewayStatusProbe = vi.fn(async (_opts?: unknown) => ({ ok: true as const }));
const loadGatewayTlsRuntime = vi.fn(async (_cfg?: unknown) => ({ const loadGatewayTlsRuntime = vi.fn(async (_cfg?: unknown) => ({
@ -18,6 +19,14 @@ const readLastGatewayErrorLine = vi.fn(async (_env?: NodeJS.ProcessEnv) => null)
const auditGatewayServiceConfig = vi.fn(async (_opts?: unknown) => undefined); const auditGatewayServiceConfig = vi.fn(async (_opts?: unknown) => undefined);
const serviceIsLoaded = vi.fn(async (_opts?: unknown) => true); const serviceIsLoaded = vi.fn(async (_opts?: unknown) => true);
const serviceReadRuntime = vi.fn(async (_env?: NodeJS.ProcessEnv) => ({ status: "running" })); const serviceReadRuntime = vi.fn(async (_env?: NodeJS.ProcessEnv) => ({ status: "running" }));
const inspectGatewayRestart = vi.fn<(opts?: unknown) => Promise<GatewayRestartSnapshot>>(
async (_opts?: unknown) => ({
runtime: { status: "running", pid: 1234 },
portUsage: { port: 19001, status: "busy", listeners: [], hints: [] },
healthy: true,
staleGatewayPids: [],
}),
);
const serviceReadCommand = vi.fn< const serviceReadCommand = vi.fn<
(env?: NodeJS.ProcessEnv) => Promise<{ (env?: NodeJS.ProcessEnv) => Promise<{
programArguments: string[]; programArguments: string[];
@ -117,6 +126,10 @@ vi.mock("./probe.js", () => ({
probeGatewayStatus: (opts: unknown) => callGatewayStatusProbe(opts), probeGatewayStatus: (opts: unknown) => callGatewayStatusProbe(opts),
})); }));
vi.mock("./restart-health.js", () => ({
inspectGatewayRestart: (opts: unknown) => inspectGatewayRestart(opts),
}));
const { gatherDaemonStatus } = await import("./status.gather.js"); const { gatherDaemonStatus } = await import("./status.gather.js");
describe("gatherDaemonStatus", () => { describe("gatherDaemonStatus", () => {
@ -139,6 +152,7 @@ describe("gatherDaemonStatus", () => {
delete process.env.DAEMON_GATEWAY_PASSWORD; delete process.env.DAEMON_GATEWAY_PASSWORD;
callGatewayStatusProbe.mockClear(); callGatewayStatusProbe.mockClear();
loadGatewayTlsRuntime.mockClear(); loadGatewayTlsRuntime.mockClear();
inspectGatewayRestart.mockClear();
daemonLoadedConfig = { daemonLoadedConfig = {
gateway: { gateway: {
bind: "lan", bind: "lan",
@ -362,4 +376,34 @@ describe("gatherDaemonStatus", () => {
expect(callGatewayStatusProbe).not.toHaveBeenCalled(); expect(callGatewayStatusProbe).not.toHaveBeenCalled();
expect(status.rpc).toBeUndefined(); expect(status.rpc).toBeUndefined();
}); });
it("surfaces stale gateway listener pids from restart health inspection", async () => {
inspectGatewayRestart.mockResolvedValueOnce({
runtime: { status: "running", pid: 8000 },
portUsage: {
port: 19001,
status: "busy",
listeners: [{ pid: 9000, ppid: 8999, commandLine: "openclaw-gateway" }],
hints: [],
},
healthy: false,
staleGatewayPids: [9000],
});
const status = await gatherDaemonStatus({
rpc: {},
probe: true,
deep: false,
});
expect(inspectGatewayRestart).toHaveBeenCalledWith(
expect.objectContaining({
port: 19001,
}),
);
expect(status.health).toEqual({
healthy: false,
staleGatewayPids: [9000],
});
});
}); });

View File

@ -29,6 +29,7 @@ import {
import { pickPrimaryTailnetIPv4 } from "../../infra/tailnet.js"; import { pickPrimaryTailnetIPv4 } from "../../infra/tailnet.js";
import { loadGatewayTlsRuntime } from "../../infra/tls/gateway.js"; import { loadGatewayTlsRuntime } from "../../infra/tls/gateway.js";
import { probeGatewayStatus } from "./probe.js"; import { probeGatewayStatus } from "./probe.js";
import { inspectGatewayRestart } from "./restart-health.js";
import { normalizeListenerAddress, parsePortFromArgs, pickProbeHostForBind } from "./shared.js"; import { normalizeListenerAddress, parsePortFromArgs, pickProbeHostForBind } from "./shared.js";
import type { GatewayRpcOpts } from "./types.js"; import type { GatewayRpcOpts } from "./types.js";
@ -112,6 +113,10 @@ export type DaemonStatus = {
error?: string; error?: string;
url?: string; url?: string;
}; };
health?: {
healthy: boolean;
staleGatewayPids: number[];
};
extraServices: Array<{ label: string; detail: string; scope: string }>; extraServices: Array<{ label: string; detail: string; scope: string }>;
}; };
@ -331,6 +336,14 @@ export async function gatherDaemonStatus(
configPath: daemonConfigSummary.path, configPath: daemonConfigSummary.path,
}) })
: undefined; : undefined;
const health =
opts.probe && loaded
? await inspectGatewayRestart({
service,
port: daemonPort,
env: serviceEnv,
}).catch(() => undefined)
: undefined;
let lastError: string | undefined; let lastError: string | undefined;
if (loaded && runtime?.status === "running" && portStatus && portStatus.status !== "busy") { if (loaded && runtime?.status === "running" && portStatus && portStatus.status !== "busy") {
@ -357,6 +370,14 @@ export async function gatherDaemonStatus(
...(portCliStatus ? { portCli: portCliStatus } : {}), ...(portCliStatus ? { portCli: portCliStatus } : {}),
lastError, lastError,
...(rpc ? { rpc: { ...rpc, url: gateway.probeUrl } } : {}), ...(rpc ? { rpc: { ...rpc, url: gateway.probeUrl } } : {}),
...(health
? {
health: {
healthy: health.healthy,
staleGatewayPids: health.staleGatewayPids,
},
}
: {}),
extraServices, extraServices,
}; };
} }

View File

@ -0,0 +1,116 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
const runtime = vi.hoisted(() => ({
log: vi.fn<(line: string) => void>(),
error: vi.fn<(line: string) => void>(),
}));
vi.mock("../../runtime.js", () => ({
defaultRuntime: runtime,
}));
vi.mock("../../terminal/theme.js", () => ({
colorize: (_rich: boolean, _theme: unknown, text: string) => text,
}));
vi.mock("../../commands/onboard-helpers.js", () => ({
resolveControlUiLinks: () => ({ httpUrl: "http://127.0.0.1:18789" }),
}));
vi.mock("../../daemon/inspect.js", () => ({
renderGatewayServiceCleanupHints: () => [],
}));
vi.mock("../../daemon/launchd.js", () => ({
resolveGatewayLogPaths: () => ({
stdoutPath: "/tmp/gateway.out.log",
stderrPath: "/tmp/gateway.err.log",
}),
}));
vi.mock("../../daemon/systemd-hints.js", () => ({
isSystemdUnavailableDetail: () => false,
renderSystemdUnavailableHints: () => [],
}));
vi.mock("../../infra/wsl.js", () => ({
isWSLEnv: () => false,
}));
vi.mock("../../logging.js", () => ({
getResolvedLoggerSettings: () => ({ file: "/tmp/openclaw.log" }),
}));
vi.mock("./shared.js", () => ({
createCliStatusTextStyles: () => ({
rich: false,
label: (text: string) => text,
accent: (text: string) => text,
infoText: (text: string) => text,
okText: (text: string) => text,
warnText: (text: string) => text,
errorText: (text: string) => text,
}),
filterDaemonEnv: () => ({}),
formatRuntimeStatus: () => "running (pid 8000)",
resolveRuntimeStatusColor: () => "",
renderRuntimeHints: () => [],
safeDaemonEnv: () => [],
}));
vi.mock("./status.gather.js", () => ({
renderPortDiagnosticsForCli: () => [],
resolvePortListeningAddresses: () => ["127.0.0.1:18789"],
}));
const { printDaemonStatus } = await import("./status.print.js");
describe("printDaemonStatus", () => {
beforeEach(() => {
runtime.log.mockReset();
runtime.error.mockReset();
});
it("prints stale gateway pid guidance when runtime does not own the listener", () => {
printDaemonStatus(
{
service: {
label: "LaunchAgent",
loaded: true,
loadedText: "loaded",
notLoadedText: "not loaded",
runtime: { status: "running", pid: 8000 },
},
gateway: {
bindMode: "loopback",
bindHost: "127.0.0.1",
port: 18789,
portSource: "env/config",
probeUrl: "ws://127.0.0.1:18789",
},
port: {
port: 18789,
status: "busy",
listeners: [{ pid: 9000, ppid: 8999, address: "127.0.0.1:18789" }],
hints: [],
},
rpc: {
ok: false,
error: "gateway closed (1006 abnormal closure (no close frame))",
url: "ws://127.0.0.1:18789",
},
health: {
healthy: false,
staleGatewayPids: [9000],
},
extraServices: [],
},
{ json: false },
);
expect(runtime.error).toHaveBeenCalledWith(
expect.stringContaining("Gateway runtime PID does not own the listening port"),
);
expect(runtime.error).toHaveBeenCalledWith(expect.stringContaining("openclaw gateway restart"));
});
});

View File

@ -194,6 +194,25 @@ export function printDaemonStatus(status: DaemonStatus, opts: { json: boolean })
spacer(); spacer();
} }
if (
status.health &&
status.health.staleGatewayPids.length > 0 &&
service.runtime?.status === "running" &&
typeof service.runtime.pid === "number"
) {
defaultRuntime.error(
errorText(
`Gateway runtime PID does not own the listening port. Other gateway process(es) are listening: ${status.health.staleGatewayPids.join(", ")}`,
),
);
defaultRuntime.error(
errorText(
`Fix: run ${formatCliCommand("openclaw gateway restart")} and re-check with ${formatCliCommand("openclaw gateway status --deep")}.`,
),
);
spacer();
}
const systemdUnavailable = const systemdUnavailable =
process.platform === "linux" && isSystemdUnavailableDetail(service.runtime?.detail); process.platform === "linux" && isSystemdUnavailableDetail(service.runtime?.detail);
if (systemdUnavailable) { if (systemdUnavailable) {

View File

@ -29,6 +29,9 @@ const launchdRestartHandoffState = vi.hoisted(() => ({
isCurrentProcessLaunchdServiceLabel: vi.fn<(label: string) => boolean>(() => false), isCurrentProcessLaunchdServiceLabel: vi.fn<(label: string) => boolean>(() => false),
scheduleDetachedLaunchdRestartHandoff: vi.fn((_params: unknown) => ({ ok: true, pid: 7331 })), scheduleDetachedLaunchdRestartHandoff: vi.fn((_params: unknown) => ({ ok: true, pid: 7331 })),
})); }));
const cleanStaleGatewayProcessesSync = vi.hoisted(() =>
vi.fn<(port?: number) => number[]>(() => []),
);
const defaultProgramArguments = ["node", "-e", "process.exit(0)"]; const defaultProgramArguments = ["node", "-e", "process.exit(0)"];
function expectLaunchctlEnableBootstrapOrder(env: Record<string, string | undefined>) { function expectLaunchctlEnableBootstrapOrder(env: Record<string, string | undefined>) {
@ -89,6 +92,10 @@ vi.mock("./launchd-restart-handoff.js", () => ({
launchdRestartHandoffState.scheduleDetachedLaunchdRestartHandoff(params), launchdRestartHandoffState.scheduleDetachedLaunchdRestartHandoff(params),
})); }));
vi.mock("../infra/restart-stale-pids.js", () => ({
cleanStaleGatewayProcessesSync: (port?: number) => cleanStaleGatewayProcessesSync(port),
}));
vi.mock("node:fs/promises", async (importOriginal) => { vi.mock("node:fs/promises", async (importOriginal) => {
const actual = await importOriginal<typeof import("node:fs/promises")>(); const actual = await importOriginal<typeof import("node:fs/promises")>();
const wrapped = { const wrapped = {
@ -151,6 +158,8 @@ beforeEach(() => {
state.dirModes.clear(); state.dirModes.clear();
state.files.clear(); state.files.clear();
state.fileModes.clear(); state.fileModes.clear();
cleanStaleGatewayProcessesSync.mockReset();
cleanStaleGatewayProcessesSync.mockReturnValue([]);
launchdRestartHandoffState.isCurrentProcessLaunchdServiceLabel.mockReset(); launchdRestartHandoffState.isCurrentProcessLaunchdServiceLabel.mockReset();
launchdRestartHandoffState.isCurrentProcessLaunchdServiceLabel.mockReturnValue(false); launchdRestartHandoffState.isCurrentProcessLaunchdServiceLabel.mockReturnValue(false);
launchdRestartHandoffState.scheduleDetachedLaunchdRestartHandoff.mockReset(); launchdRestartHandoffState.scheduleDetachedLaunchdRestartHandoff.mockReset();
@ -328,7 +337,10 @@ describe("launchd install", () => {
}); });
it("restarts LaunchAgent with kickstart and no bootout", async () => { it("restarts LaunchAgent with kickstart and no bootout", async () => {
const env = createDefaultLaunchdEnv(); const env = {
...createDefaultLaunchdEnv(),
OPENCLAW_GATEWAY_PORT: "18789",
};
const result = await restartLaunchAgent({ const result = await restartLaunchAgent({
env, env,
stdout: new PassThrough(), stdout: new PassThrough(),
@ -338,11 +350,38 @@ describe("launchd install", () => {
const label = "ai.openclaw.gateway"; const label = "ai.openclaw.gateway";
const serviceId = `${domain}/${label}`; const serviceId = `${domain}/${label}`;
expect(result).toEqual({ outcome: "completed" }); expect(result).toEqual({ outcome: "completed" });
expect(cleanStaleGatewayProcessesSync).toHaveBeenCalledWith(18789);
expect(state.launchctlCalls).toContainEqual(["kickstart", "-k", serviceId]); expect(state.launchctlCalls).toContainEqual(["kickstart", "-k", serviceId]);
expect(state.launchctlCalls.some((call) => call[0] === "bootout")).toBe(false); expect(state.launchctlCalls.some((call) => call[0] === "bootout")).toBe(false);
expect(state.launchctlCalls.some((call) => call[0] === "bootstrap")).toBe(false); expect(state.launchctlCalls.some((call) => call[0] === "bootstrap")).toBe(false);
}); });
it("uses the configured gateway port for stale cleanup", async () => {
const env = {
...createDefaultLaunchdEnv(),
OPENCLAW_GATEWAY_PORT: "19001",
};
await restartLaunchAgent({
env,
stdout: new PassThrough(),
});
expect(cleanStaleGatewayProcessesSync).toHaveBeenCalledWith(19001);
});
it("skips stale cleanup when no explicit launch agent port can be resolved", async () => {
const env = createDefaultLaunchdEnv();
state.files.clear();
await restartLaunchAgent({
env,
stdout: new PassThrough(),
});
expect(cleanStaleGatewayProcessesSync).not.toHaveBeenCalled();
});
it("falls back to bootstrap when kickstart cannot find the service", async () => { it("falls back to bootstrap when kickstart cannot find the service", async () => {
const env = createDefaultLaunchdEnv(); const env = createDefaultLaunchdEnv();
state.kickstartError = "Could not find service"; state.kickstartError = "Could not find service";

View File

@ -1,6 +1,7 @@
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import path from "node:path"; import path from "node:path";
import { parseStrictInteger, parseStrictPositiveInteger } from "../infra/parse-finite-number.js"; import { parseStrictInteger, parseStrictPositiveInteger } from "../infra/parse-finite-number.js";
import { cleanStaleGatewayProcessesSync } from "../infra/restart-stale-pids.js";
import { import {
GATEWAY_LAUNCH_AGENT_LABEL, GATEWAY_LAUNCH_AGENT_LABEL,
resolveGatewayServiceDescription, resolveGatewayServiceDescription,
@ -113,6 +114,44 @@ async function execLaunchctl(
return await execFileUtf8(file, fileArgs, isWindows ? { windowsHide: true } : {}); return await execFileUtf8(file, fileArgs, isWindows ? { windowsHide: true } : {});
} }
function parseGatewayPortFromProgramArguments(
programArguments: string[] | undefined,
): number | null {
if (!Array.isArray(programArguments) || programArguments.length === 0) {
return null;
}
for (let index = 0; index < programArguments.length; index += 1) {
const current = programArguments[index]?.trim();
if (!current) {
continue;
}
if (current === "--port") {
const next = parseStrictPositiveInteger(programArguments[index + 1] ?? "");
if (next !== undefined) {
return next;
}
continue;
}
if (current.startsWith("--port=")) {
const value = parseStrictPositiveInteger(current.slice("--port=".length));
if (value !== undefined) {
return value;
}
}
}
return null;
}
async function resolveLaunchAgentGatewayPort(env: GatewayServiceEnv): Promise<number | null> {
const command = await readLaunchAgentProgramArguments(env).catch(() => null);
const fromArgs = parseGatewayPortFromProgramArguments(command?.programArguments);
if (fromArgs !== null) {
return fromArgs;
}
const fromEnv = parseStrictPositiveInteger(env.OPENCLAW_GATEWAY_PORT ?? "");
return fromEnv ?? null;
}
function resolveGuiDomain(): string { function resolveGuiDomain(): string {
if (typeof process.getuid !== "function") { if (typeof process.getuid !== "function") {
return "gui/501"; return "gui/501";
@ -514,6 +553,11 @@ export async function restartLaunchAgent({
return { outcome: "scheduled" }; return { outcome: "scheduled" };
} }
const cleanupPort = await resolveLaunchAgentGatewayPort(serviceEnv);
if (cleanupPort !== null) {
cleanStaleGatewayProcessesSync(cleanupPort);
}
const start = await execLaunchctl(["kickstart", "-k", serviceTarget]); const start = await execLaunchctl(["kickstart", "-k", serviceTarget]);
if (start.code === 0) { if (start.code === 0) {
writeLaunchAgentActionLine(stdout, "Restarted LaunchAgent", serviceTarget); writeLaunchAgentActionLine(stdout, "Restarted LaunchAgent", serviceTarget);

View File

@ -1,5 +1,8 @@
import os from "node:os";
import path from "node:path";
import { afterAll, beforeAll, describe, expect, test } from "vitest"; import { afterAll, beforeAll, describe, expect, test } from "vitest";
import { import {
BACKEND_GATEWAY_CLIENT,
connectReq, connectReq,
CONTROL_UI_CLIENT, CONTROL_UI_CLIENT,
ConnectErrorDetailCodes, ConnectErrorDetailCodes,
@ -144,6 +147,50 @@ describe("gateway auth compatibility baseline", () => {
ws.close(); ws.close();
} }
}); });
test("keeps local backend device-token reconnects out of pairing", async () => {
const identityPath = path.join(
os.tmpdir(),
`openclaw-backend-device-${process.pid}-${port}.json`,
);
const { loadOrCreateDeviceIdentity, publicKeyRawBase64UrlFromPem } =
await import("../infra/device-identity.js");
const { approveDevicePairing, requestDevicePairing, rotateDeviceToken } =
await import("../infra/device-pairing.js");
const identity = loadOrCreateDeviceIdentity(identityPath);
const pending = await requestDevicePairing({
deviceId: identity.deviceId,
publicKey: publicKeyRawBase64UrlFromPem(identity.publicKeyPem),
clientId: BACKEND_GATEWAY_CLIENT.id,
clientMode: BACKEND_GATEWAY_CLIENT.mode,
role: "operator",
scopes: ["operator.admin"],
});
await approveDevicePairing(pending.request.requestId);
const rotated = await rotateDeviceToken({
deviceId: identity.deviceId,
role: "operator",
scopes: ["operator.admin"],
});
expect(rotated?.token).toBeTruthy();
const ws = await openWs(port);
try {
const res = await connectReq(ws, {
skipDefaultAuth: true,
client: { ...BACKEND_GATEWAY_CLIENT },
deviceIdentityPath: identityPath,
deviceToken: String(rotated?.token ?? ""),
scopes: ["operator.admin"],
});
expect(res.ok).toBe(true);
expect((res.payload as { type?: string } | undefined)?.type).toBe("hello-ok");
} finally {
ws.close();
}
});
}); });
describe("password mode", () => { describe("password mode", () => {

View File

@ -89,7 +89,7 @@ describe("handshake auth helpers", () => {
).toBe(false); ).toBe(false);
}); });
it("skips backend self-pairing only for local shared-secret backend clients", () => { it("skips backend self-pairing for local trusted backend clients", () => {
const connectParams = { const connectParams = {
client: { client: {
id: GATEWAY_CLIENT_IDS.GATEWAY_CLIENT, id: GATEWAY_CLIENT_IDS.GATEWAY_CLIENT,
@ -106,6 +106,15 @@ describe("handshake auth helpers", () => {
authMethod: "token", authMethod: "token",
}), }),
).toBe(true); ).toBe(true);
expect(
shouldSkipBackendSelfPairing({
connectParams,
isLocalClient: true,
hasBrowserOriginHeader: false,
sharedAuthOk: false,
authMethod: "device-token",
}),
).toBe(true);
expect( expect(
shouldSkipBackendSelfPairing({ shouldSkipBackendSelfPairing({
connectParams, connectParams,

View File

@ -74,11 +74,14 @@ export function shouldSkipBackendSelfPairing(params: {
return false; return false;
} }
const usesSharedSecretAuth = params.authMethod === "token" || params.authMethod === "password"; const usesSharedSecretAuth = params.authMethod === "token" || params.authMethod === "password";
const usesDeviceTokenAuth = params.authMethod === "device-token";
// `authMethod === "device-token"` only reaches this helper after the caller
// has already accepted auth (`authOk === true`), so a separate
// `deviceTokenAuthOk` flag would be redundant here.
return ( return (
params.isLocalClient && params.isLocalClient &&
!params.hasBrowserOriginHeader && !params.hasBrowserOriginHeader &&
params.sharedAuthOk && ((params.sharedAuthOk && usesSharedSecretAuth) || usesDeviceTokenAuth)
usesSharedSecretAuth
); );
} }