From fd968bfb2d16abf5232ad29ad7397068e6bc5b03 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 5 Apr 2026 09:01:05 +0100 Subject: [PATCH] fix: recover unloaded macOS launch agents (#43766) --- CHANGELOG.md | 1 + .../lifecycle-core.config-guard.test.ts | 22 +++++ src/cli/daemon-cli/lifecycle-core.test.ts | 57 +++++++++++++ src/cli/daemon-cli/lifecycle-core.ts | 47 ++++++++--- src/cli/daemon-cli/lifecycle.test.ts | 81 ++++++++++++++++++- src/cli/daemon-cli/lifecycle.ts | 30 ++++++- src/daemon/launchd.test.ts | 46 ++++++++++- src/daemon/launchd.ts | 7 +- 8 files changed, 276 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b17a107fdb4..4246acc0d21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,6 +68,7 @@ Docs: https://docs.openclaw.ai - MS Teams: download inline DM images via Graph API and preserve channel reply threading in proactive fallback. (#52212, #55198) - Agents/Claude CLI: persist explicit `openclaw agent --session-id` runs under a stable session key so follow-ups can reuse the stored CLI binding and resume the same underlying Claude session. - Agents/CLI backends: invalidate stored CLI session reuse when local CLI login state or the selected auth profile credential changes, so relogin and token rotation stop resuming stale sessions. +- Gateway/macOS: recover installed-but-unloaded LaunchAgents during `openclaw gateway start` and `restart`, while still preferring live unmanaged gateways during restart recovery. (#43766) Thanks @HenryC-3. - Auth/failover: persist selected fallback overrides before retrying, shorten `auth_permanent` lockouts, and refresh websocket/shared-auth sessions only when real auth changes occur so retries and secret rotations behave predictably. (#60404, #60323, #60387) - Cron: replay interrupted recurring jobs on the first gateway restart instead of waiting for a second restart. (#60583) Thanks @joelnishanth. - Plugins/media understanding: enable bundled Groq and Deepgram providers by default so configured transcription models work without extra plugin activation config. (#59982) Thanks @yxjsxy. diff --git a/src/cli/daemon-cli/lifecycle-core.config-guard.test.ts b/src/cli/daemon-cli/lifecycle-core.config-guard.test.ts index 59a2926e993..6055623ae4f 100644 --- a/src/cli/daemon-cli/lifecycle-core.config-guard.test.ts +++ b/src/cli/daemon-cli/lifecycle-core.config-guard.test.ts @@ -132,6 +132,28 @@ describe("runServiceStart config pre-flight (#35862)", () => { expect(service.restart).not.toHaveBeenCalled(); }); + it("aborts before not-loaded start recovery when config is invalid", async () => { + const onNotLoaded = vi.fn(async () => ({ + result: "started" as const, + loaded: true, + })); + setConfigSnapshot({ + exists: true, + valid: false, + issues: [{ path: "agents.defaults.pdfModel", message: "Unrecognized key" }], + }); + + await expect( + runServiceStart({ + ...createServiceRunArgs(), + onNotLoaded, + }), + ).rejects.toThrow("__exit__:1"); + + expect(onNotLoaded).not.toHaveBeenCalled(); + expect(service.restart).not.toHaveBeenCalled(); + }); + it("proceeds with start when config is valid", async () => { setConfigSnapshot({ exists: true, valid: true }); diff --git a/src/cli/daemon-cli/lifecycle-core.test.ts b/src/cli/daemon-cli/lifecycle-core.test.ts index 6a0c05b1f1b..a9bcf37d4d3 100644 --- a/src/cli/daemon-cli/lifecycle-core.test.ts +++ b/src/cli/daemon-cli/lifecycle-core.test.ts @@ -218,6 +218,33 @@ describe("runServiceRestart token drift", () => { expect(service.stop).not.toHaveBeenCalled(); }); + it("emits started when a not-loaded start path repairs the service", async () => { + service.isLoaded.mockResolvedValue(false); + + await runServiceStart({ + serviceNoun: "Gateway", + service, + renderStartHints: () => [], + opts: { json: true }, + onNotLoaded: async () => ({ + result: "started", + message: + "Gateway LaunchAgent was installed but not loaded; re-bootstrapped launchd service.", + loaded: true, + }), + }); + + const payload = readJsonLog<{ + result?: string; + message?: string; + service?: { loaded?: boolean }; + }>(); + expect(payload.result).toBe("started"); + expect(payload.message).toContain("re-bootstrapped"); + expect(payload.service?.loaded).toBe(true); + expect(service.restart).not.toHaveBeenCalled(); + }); + it("runs restart health checks after an unmanaged restart signal", async () => { const postRestartCheck = vi.fn(async () => {}); service.isLoaded.mockResolvedValue(false); @@ -242,6 +269,36 @@ describe("runServiceRestart token drift", () => { expect(payload.message).toContain("unmanaged process"); }); + it("emits loaded restart state when launchd repair handles a not-loaded restart", async () => { + const postRestartCheck = vi.fn(async () => {}); + service.isLoaded.mockResolvedValue(false); + + await runServiceRestart({ + serviceNoun: "Gateway", + service, + renderStartHints: () => [], + opts: { json: true }, + onNotLoaded: async () => ({ + result: "restarted", + message: + "Gateway LaunchAgent was installed but not loaded; re-bootstrapped launchd service.", + loaded: true, + }), + postRestartCheck, + }); + + expect(postRestartCheck).toHaveBeenCalledTimes(1); + expect(service.restart).not.toHaveBeenCalled(); + const payload = readJsonLog<{ + result?: string; + message?: string; + service?: { loaded?: boolean }; + }>(); + expect(payload.result).toBe("restarted"); + expect(payload.message).toContain("re-bootstrapped"); + expect(payload.service?.loaded).toBe(true); + }); + it("skips restart health checks when restart is only scheduled", async () => { const postRestartCheck = vi.fn(async () => {}); service.restart.mockResolvedValue({ outcome: "scheduled" }); diff --git a/src/cli/daemon-cli/lifecycle-core.ts b/src/cli/daemon-cli/lifecycle-core.ts index a508e2f45dc..393826ad972 100644 --- a/src/cli/daemon-cli/lifecycle-core.ts +++ b/src/cli/daemon-cli/lifecycle-core.ts @@ -31,9 +31,10 @@ type RestartPostCheckContext = { }; type NotLoadedActionResult = { - result: "stopped" | "restarted"; + result: "started" | "stopped" | "restarted"; message?: string; warnings?: string[]; + loaded?: boolean; }; type NotLoadedActionContext = { @@ -186,17 +187,17 @@ export async function runServiceStart(params: { service: GatewayService; renderStartHints: () => string[]; opts?: DaemonLifecycleOptions; + onNotLoaded?: (ctx: NotLoadedActionContext) => Promise; }) { const json = Boolean(params.opts?.json); const { stdout, emit, fail } = createDaemonActionContext({ action: "start", json }); + const loaded = await resolveServiceLoadedOrFail({ + serviceNoun: params.serviceNoun, + service: params.service, + fail, + }); - if ( - (await resolveServiceLoadedOrFail({ - serviceNoun: params.serviceNoun, - service: params.service, - fail, - })) === null - ) { + if (loaded === null) { return; } // Pre-flight config validation (#35862) — run for both loaded and not-loaded @@ -210,6 +211,28 @@ export async function runServiceStart(params: { return; } } + if (!loaded) { + try { + const handled = await params.onNotLoaded?.({ json, stdout, fail }); + if (handled) { + emit({ + ok: true, + result: handled.result, + message: handled.message, + warnings: handled.warnings, + service: buildDaemonServiceSnapshot(params.service, handled.loaded ?? false), + }); + if (!json && handled.message) { + defaultRuntime.log(handled.message); + } + return; + } + } catch (err) { + const hints = params.renderStartHints(); + fail(`${params.serviceNoun} start failed: ${String(err)}`, hints); + return; + } + } try { const startResult = await startGatewayService(params.service, { env: process.env, stdout }); if (startResult.outcome === "missing-install") { @@ -332,6 +355,7 @@ export async function runServiceRestart(params: { const { stdout, emit, fail } = createDaemonActionContext({ action: "restart", json }); const warnings: string[] = []; let handledNotLoaded: NotLoadedActionResult | null = null; + let recoveredLoadedState: boolean | null = null; const emitScheduledRestart = ( restartStatus: ReturnType, serviceLoaded: boolean, @@ -392,6 +416,7 @@ export async function runServiceRestart(params: { if (handledNotLoaded.warnings?.length) { warnings.push(...handledNotLoaded.warnings); } + recoveredLoadedState = handledNotLoaded.loaded ?? null; } if (loaded && params.checkTokenDrift) { @@ -437,14 +462,14 @@ export async function runServiceRestart(params: { } let restartStatus = describeGatewayServiceRestart(params.serviceNoun, restartResult); if (restartStatus.scheduled) { - return emitScheduledRestart(restartStatus, loaded); + return emitScheduledRestart(restartStatus, loaded || recoveredLoadedState === true); } if (params.postRestartCheck) { const postRestartResult = await params.postRestartCheck({ json, stdout, warnings, fail }); if (postRestartResult) { restartStatus = describeGatewayServiceRestart(params.serviceNoun, postRestartResult); if (restartStatus.scheduled) { - return emitScheduledRestart(restartStatus, loaded); + return emitScheduledRestart(restartStatus, loaded || recoveredLoadedState === true); } } } @@ -455,6 +480,8 @@ export async function runServiceRestart(params: { } catch { restarted = true; } + } else if (recoveredLoadedState !== null) { + restarted = recoveredLoadedState; } emit({ ok: true, diff --git a/src/cli/daemon-cli/lifecycle.test.ts b/src/cli/daemon-cli/lifecycle.test.ts index 0835e77e2e6..25d6a6b3c4b 100644 --- a/src/cli/daemon-cli/lifecycle.test.ts +++ b/src/cli/daemon-cli/lifecycle.test.ts @@ -27,6 +27,7 @@ const service = { restart: vi.fn(), }; +const runServiceStart = vi.fn(); const runServiceRestart = vi.fn(); const runServiceStop = vi.fn(); const waitForGatewayHealthyListener = vi.fn(); @@ -50,6 +51,8 @@ const probeGateway = vi.fn< >(); const isRestartEnabled = vi.fn<(config?: { commands?: unknown }) => boolean>(() => true); const loadConfig = vi.hoisted(() => vi.fn(() => ({}))); +const launchAgentPlistExists = vi.hoisted(() => vi.fn()); +const repairLaunchAgentBootstrap = vi.hoisted(() => vi.fn()); vi.mock("../../config/config.js", () => ({ loadConfig: () => loadConfig(), @@ -81,6 +84,12 @@ vi.mock("../../daemon/service.js", () => ({ resolveGatewayService: () => service, })); +vi.mock("../../daemon/launchd.js", () => ({ + launchAgentPlistExists: (env: Record) => launchAgentPlistExists(env), + repairLaunchAgentBootstrap: (args: { env?: Record }) => + repairLaunchAgentBootstrap(args), +})); + vi.mock("./restart-health.js", () => ({ DEFAULT_RESTART_HEALTH_ATTEMPTS: 120, DEFAULT_RESTART_HEALTH_DELAY_MS: 500, @@ -93,12 +102,13 @@ vi.mock("./restart-health.js", () => ({ vi.mock("./lifecycle-core.js", () => ({ runServiceRestart, - runServiceStart: vi.fn(), + runServiceStart, runServiceStop, runServiceUninstall: vi.fn(), })); describe("runDaemonRestart health checks", () => { + let runDaemonStart: (opts?: { json?: boolean }) => Promise; let runDaemonRestart: (opts?: { json?: boolean }) => Promise; let runDaemonStop: (opts?: { json?: boolean }) => Promise; let envSnapshot: ReturnType; @@ -127,7 +137,7 @@ describe("runDaemonRestart health checks", () => { } beforeAll(async () => { - ({ runDaemonRestart, runDaemonStop } = await import("./lifecycle.js")); + ({ runDaemonStart, runDaemonRestart, runDaemonStop } = await import("./lifecycle.js")); }); beforeEach(() => { @@ -135,6 +145,7 @@ describe("runDaemonRestart health checks", () => { delete process.env.OPENCLAW_CONTAINER_HINT; service.readCommand.mockReset(); service.restart.mockReset(); + runServiceStart.mockReset(); runServiceRestart.mockReset(); runServiceStop.mockReset(); waitForGatewayHealthyListener.mockReset(); @@ -149,12 +160,17 @@ describe("runDaemonRestart health checks", () => { probeGateway.mockReset(); isRestartEnabled.mockReset(); loadConfig.mockReset(); + launchAgentPlistExists.mockReset(); + repairLaunchAgentBootstrap.mockReset(); service.readCommand.mockResolvedValue({ programArguments: ["openclaw", "gateway", "--port", "18789"], environment: {}, }); service.restart.mockResolvedValue({ outcome: "completed" }); + runServiceStart.mockResolvedValue(undefined); + launchAgentPlistExists.mockResolvedValue(false); + repairLaunchAgentBootstrap.mockResolvedValue({ ok: true }); runServiceRestart.mockImplementation(async (params: RestartParams) => { const fail = (message: string, hints?: string[]) => { @@ -175,6 +191,12 @@ describe("runDaemonRestart health checks", () => { healthy: true, portUsage: { port: 18789, status: "busy", listeners: [], hints: [] }, }); + waitForGatewayHealthyRestart.mockResolvedValue({ + healthy: true, + staleGatewayPids: [], + runtime: { status: "running" }, + portUsage: { port: 18789, status: "busy", listeners: [], hints: [] }, + }); probeGateway.mockResolvedValue({ ok: true, configSnapshot: { commands: { restart: true } }, @@ -189,6 +211,19 @@ describe("runDaemonRestart health checks", () => { vi.restoreAllMocks(); }); + it("re-bootstraps an installed LaunchAgent when start finds it not loaded", async () => { + vi.spyOn(process, "platform", "get").mockReturnValue("darwin"); + launchAgentPlistExists.mockResolvedValue(true); + runServiceStart.mockImplementation(async (params: { onNotLoaded?: () => Promise }) => { + await params.onNotLoaded?.(); + }); + + await runDaemonStart({ json: true }); + + expect(launchAgentPlistExists).toHaveBeenCalledWith(process.env); + expect(repairLaunchAgentBootstrap).toHaveBeenCalledWith({ env: process.env }); + }); + it("kills stale gateway pids and retries restart", async () => { const unhealthy: RestartHealthSnapshot = { healthy: false, @@ -307,6 +342,48 @@ describe("runDaemonRestart health checks", () => { expect(service.restart).not.toHaveBeenCalled(); }); + it("prefers unmanaged restart over launchd repair when a gateway listener is present", async () => { + vi.spyOn(process, "platform", "get").mockReturnValue("darwin"); + launchAgentPlistExists.mockResolvedValue(true); + findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([4200]); + mockUnmanagedRestart({ runPostRestartCheck: true }); + + await runDaemonRestart({ json: true }); + + expect(signalVerifiedGatewayPidSync).toHaveBeenCalledWith(4200, "SIGUSR1"); + expect(repairLaunchAgentBootstrap).not.toHaveBeenCalled(); + expect(waitForGatewayHealthyListener).toHaveBeenCalledTimes(1); + expect(waitForGatewayHealthyRestart).not.toHaveBeenCalled(); + }); + + it("re-bootstraps an installed LaunchAgent on restart when no unmanaged listener exists", async () => { + vi.spyOn(process, "platform", "get").mockReturnValue("darwin"); + launchAgentPlistExists.mockResolvedValue(true); + findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([]); + runServiceRestart.mockImplementation( + async (params: RestartParams & { onNotLoaded?: () => Promise }) => { + await params.onNotLoaded?.(); + await params.postRestartCheck?.({ + json: Boolean(params.opts?.json), + stdout: process.stdout, + warnings: [], + fail: (message: string) => { + throw new Error(message); + }, + }); + return true; + }, + ); + + await runDaemonRestart({ json: true }); + + expect(repairLaunchAgentBootstrap).toHaveBeenCalledWith({ env: process.env }); + expect(signalVerifiedGatewayPidSync).not.toHaveBeenCalled(); + expect(waitForGatewayHealthyListener).not.toHaveBeenCalled(); + expect(waitForGatewayHealthyRestart).toHaveBeenCalledTimes(1); + expect(service.restart).not.toHaveBeenCalled(); + }); + it("fails unmanaged restart when multiple gateway listeners are present", async () => { findVerifiedGatewayListenerPidsOnPortSync.mockReturnValue([4200, 4300]); mockUnmanagedRestart(); diff --git a/src/cli/daemon-cli/lifecycle.ts b/src/cli/daemon-cli/lifecycle.ts index f0d7efda940..56924d8a9d2 100644 --- a/src/cli/daemon-cli/lifecycle.ts +++ b/src/cli/daemon-cli/lifecycle.ts @@ -1,5 +1,6 @@ import { isRestartEnabled } from "../../config/commands.js"; import { readBestEffortConfig, resolveGatewayPort } from "../../config/config.js"; +import { launchAgentPlistExists, repairLaunchAgentBootstrap } from "../../daemon/launchd.js"; import { resolveGatewayService } from "../../daemon/service.js"; import { probeGateway } from "../../gateway/probe.js"; import { @@ -130,6 +131,28 @@ async function restartGatewayWithoutServiceManager(port: number) { }; } +async function repairLaunchAgentIfInstalled(params: { result: "started" | "restarted" }) { + if (process.platform !== "darwin") { + return null; + } + const serviceEnv = process.env as Record; + const plistExists = await launchAgentPlistExists(serviceEnv).catch(() => false); + if (!plistExists) { + return null; + } + const repaired = await repairLaunchAgentBootstrap({ env: serviceEnv }).catch(() => ({ + ok: false, + })); + if (!repaired.ok) { + return null; + } + return { + result: params.result, + loaded: true, + message: "Gateway LaunchAgent was installed but not loaded; re-bootstrapped launchd service.", + } as const; +} + export async function runDaemonUninstall(opts: DaemonLifecycleOptions = {}) { return await runServiceUninstall({ serviceNoun: "Gateway", @@ -145,6 +168,10 @@ export async function runDaemonStart(opts: DaemonLifecycleOptions = {}) { serviceNoun: "Gateway", service: resolveGatewayService(), renderStartHints: renderGatewayServiceStartHints, + onNotLoaded: + process.platform === "darwin" + ? async () => await repairLaunchAgentIfInstalled({ result: "started" }) + : undefined, opts, }); } @@ -187,8 +214,9 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi const handled = await restartGatewayWithoutServiceManager(restartPort); if (handled) { restartedWithoutServiceManager = true; + return handled; } - return handled; + return await repairLaunchAgentIfInstalled({ result: "restarted" }); }, postRestartCheck: async ({ warnings, fail, stdout }) => { if (restartedWithoutServiceManager) { diff --git a/src/daemon/launchd.test.ts b/src/daemon/launchd.test.ts index c500615e612..c18ac82e1fb 100644 --- a/src/daemon/launchd.test.ts +++ b/src/daemon/launchd.test.ts @@ -18,6 +18,7 @@ const state = vi.hoisted(() => ({ listOutput: "", printOutput: "", bootstrapError: "", + bootstrapCode: 1, kickstartError: "", kickstartFailuresRemaining: 0, dirs: new Set(), @@ -75,7 +76,7 @@ vi.mock("./exec-file.js", () => ({ return { stdout: state.printOutput, stderr: "", code: 0 }; } if (call[0] === "bootstrap" && state.bootstrapError) { - return { stdout: "", stderr: state.bootstrapError, code: 1 }; + return { stdout: "", stderr: state.bootstrapError, code: state.bootstrapCode }; } if (call[0] === "kickstart" && state.kickstartError && state.kickstartFailuresRemaining > 0) { state.kickstartFailuresRemaining -= 1; @@ -152,6 +153,7 @@ beforeEach(() => { state.listOutput = ""; state.printOutput = ""; state.bootstrapError = ""; + state.bootstrapCode = 1; state.kickstartError = ""; state.kickstartFailuresRemaining = 0; state.dirs.clear(); @@ -255,6 +257,48 @@ describe("launchd bootstrap repair", () => { expect(kickstartIndex).toBeGreaterThanOrEqual(0); expect(bootstrapIndex).toBeLessThan(kickstartIndex); }); + + it("treats bootstrap exit 130 as success", async () => { + state.bootstrapError = "Service already loaded"; + state.bootstrapCode = 130; + const env: Record = { + HOME: "/Users/test", + OPENCLAW_PROFILE: "default", + }; + + const repair = await repairLaunchAgentBootstrap({ env }); + + expect(repair.ok).toBe(true); + expect(state.launchctlCalls.filter((call) => call[0] === "kickstart")).toHaveLength(1); + }); + + it("treats 'already exists in domain' bootstrap failures as success", async () => { + state.bootstrapError = + "Could not bootstrap service: 5: Input/output error: already exists in domain for gui/501"; + const env: Record = { + HOME: "/Users/test", + OPENCLAW_PROFILE: "default", + }; + + const repair = await repairLaunchAgentBootstrap({ env }); + + expect(repair.ok).toBe(true); + expect(state.launchctlCalls.filter((call) => call[0] === "kickstart")).toHaveLength(1); + }); + + it("keeps genuine bootstrap failures as failures", async () => { + state.bootstrapError = "Could not find specified service"; + const env: Record = { + HOME: "/Users/test", + OPENCLAW_PROFILE: "default", + }; + + const repair = await repairLaunchAgentBootstrap({ env }); + + expect(repair.ok).toBe(false); + expect(repair.detail).toContain("Could not find specified service"); + expect(state.launchctlCalls.some((call) => call[0] === "kickstart")).toBe(false); + }); }); describe("launchd install", () => { diff --git a/src/daemon/launchd.ts b/src/daemon/launchd.ts index 98252296550..7a11db70b7a 100644 --- a/src/daemon/launchd.ts +++ b/src/daemon/launchd.ts @@ -325,7 +325,12 @@ export async function repairLaunchAgentBootstrap(args: { await execLaunchctl(["enable", `${domain}/${label}`]); const boot = await execLaunchctl(["bootstrap", domain, plistPath]); if (boot.code !== 0) { - return { ok: false, detail: (boot.stderr || boot.stdout).trim() || undefined }; + const detail = (boot.stderr || boot.stdout).trim(); + const normalized = detail.toLowerCase(); + const alreadyLoaded = boot.code === 130 || normalized.includes("already exists in domain"); + if (!alreadyLoaded) { + return { ok: false, detail: detail || undefined }; + } } const kick = await execLaunchctl(["kickstart", "-k", `${domain}/${label}`]); if (kick.code !== 0) {