From 8c21284c1cbc151ab22b5f0aa724da21e5ea6a28 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 13 Mar 2026 20:36:47 +0000 Subject: [PATCH] refactor: share stale pid polling fixtures --- src/infra/restart-stale-pids.test.ts | 266 +++++++++------------------ 1 file changed, 85 insertions(+), 181 deletions(-) diff --git a/src/infra/restart-stale-pids.test.ts b/src/infra/restart-stale-pids.test.ts index f7bf0709d9f..b7589d26e15 100644 --- a/src/infra/restart-stale-pids.test.ts +++ b/src/infra/restart-stale-pids.test.ts @@ -42,6 +42,51 @@ function lsofOutput(entries: Array<{ pid: number; cmd: string }>): string { return entries.map(({ pid, cmd }) => `p${pid}\nc${cmd}`).join("\n") + "\n"; } +type MockLsofResult = { + error: Error | null; + status: number | null; + stdout: string; + stderr: string; +}; + +function createLsofResult(overrides: Partial = {}): MockLsofResult { + return { + error: null, + status: 0, + stdout: "", + stderr: "", + ...overrides, + }; +} + +function createOpenClawBusyResult(pid: number, overrides: Partial = {}) { + return createLsofResult({ + stdout: lsofOutput([{ pid, cmd: "openclaw-gateway" }]), + ...overrides, + }); +} + +function createErrnoResult(code: string, message: string) { + const error = new Error(message) as NodeJS.ErrnoException; + error.code = code; + return createLsofResult({ error, status: null }); +} + +function installInitialBusyPoll( + stalePid: number, + resolvePoll: (call: number) => MockLsofResult, +): () => number { + let call = 0; + mockSpawnSync.mockImplementation(() => { + call += 1; + if (call === 1) { + return createOpenClawBusyResult(stalePid); + } + return resolvePoll(call); + }); + return () => call; +} + describe.skipIf(isWindows)("restart-stale-pids", () => { beforeEach(() => { mockSpawnSync.mockReset(); @@ -201,20 +246,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // lsof exits with status 1 when no matching processes are found — this is // the canonical "port is free" signal, not an error. const stalePid = process.pid + 500; - let call = 0; - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } - // Poll returns status 1 — no listeners - return { error: null, status: 1, stdout: "", stderr: "" }; - }); + installInitialBusyPoll(stalePid, () => createLsofResult({ status: 1 })); vi.spyOn(process, "kill").mockReturnValue(true); // Should complete cleanly (port reported free on status 1) expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); @@ -225,27 +257,17 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // bad flag, runtime error) must not be mapped to free:true. They are // inconclusive and should keep the polling loop running until budget expires. const stalePid = process.pid + 501; - let call = 0; const events: string[] = []; - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - events.push("initial-find"); - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } + events.push("initial-find"); + installInitialBusyPoll(stalePid, (call) => { if (call === 2) { // Permission/runtime error — status 2, should NOT be treated as free events.push("error-poll"); - return { error: null, status: 2, stdout: "", stderr: "lsof: permission denied" }; + return createLsofResult({ status: 2, stderr: "lsof: permission denied" }); } // Eventually port is free events.push("free-poll"); - return { error: null, status: 1, stdout: "", stderr: "" }; + return createLsofResult({ status: 1 }); }); vi.spyOn(process, "kill").mockReturnValue(true); cleanStaleGatewayProcessesSync(); @@ -263,29 +285,13 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // The fix: pollPortOnce now parses res.stdout directly from the first // spawnSync call. Exactly ONE lsof invocation per poll cycle. const stalePid = process.pid + 400; - let spawnCount = 0; - mockSpawnSync.mockImplementation(() => { - spawnCount++; - if (spawnCount === 1) { - // Initial findGatewayPidsOnPortSync — returns stale pid - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } - if (spawnCount === 2) { + const getCallCount = installInitialBusyPoll(stalePid, (call) => { + if (call === 2) { // First waitForPortFreeSync poll — status 0, port busy (should parse inline, not spawn again) - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; + return createOpenClawBusyResult(stalePid); } // Port free on third call - return { error: null, status: 0, stdout: "", stderr: "" }; + return createLsofResult(); }); vi.spyOn(process, "kill").mockReturnValue(true); @@ -294,7 +300,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // If pollPortOnce made a second lsof call internally, spawnCount would // be at least 4 (initial + 2 polls each doubled). With the fix, each poll // is exactly one spawn: initial(1) + busy-poll(1) + free-poll(1) = 3. - expect(spawnCount).toBe(3); + expect(getCallCount()).toBe(3); }); it("lsof status 1 with non-empty openclaw stdout is treated as busy, not free (Linux container edge case)", () => { @@ -302,34 +308,21 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // lsof can exit 1 AND still emit output for processes it could read. // status 1 + non-empty openclaw stdout must not be treated as port-free. const stalePid = process.pid + 601; - let call = 0; - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - // Initial scan: finds stale pid - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } + const getCallCount = installInitialBusyPoll(stalePid, (call) => { if (call === 2) { // status 1 + openclaw pid in stdout — container-restricted lsof reports partial results - return { - error: null, + return createOpenClawBusyResult(stalePid, { status: 1, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), stderr: "lsof: WARNING: can't stat() fuse", - }; + }); } // Third poll: port is genuinely free - return { error: null, status: 1, stdout: "", stderr: "" }; + return createLsofResult({ status: 1 }); }); vi.spyOn(process, "kill").mockReturnValue(true); cleanStaleGatewayProcessesSync(); // Poll 2 returned busy (not free), so we must have polled at least 3 times - expect(call).toBeGreaterThanOrEqual(3); + expect(getCallCount()).toBeGreaterThanOrEqual(3); }); it("pollPortOnce outer catch returns { free: null, permanent: false } when resolveLsofCommandSync throws", () => { @@ -382,20 +375,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { it("sends SIGTERM to stale pids and returns them", () => { const stalePid = process.pid + 100; - let call = 0; - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } - // waitForPortFreeSync polls: port free immediately - return { error: null, status: 0, stdout: "", stderr: "" }; - }); + installInitialBusyPoll(stalePid, () => createLsofResult()); const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); const result = cleanStaleGatewayProcessesSync(); @@ -474,24 +454,11 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // immediately on ENOENT rather than spinning the full 2-second budget. const stalePid = process.pid + 300; const events: string[] = []; - let call = 0; - - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - events.push("initial-find"); - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } + events.push("initial-find"); + installInitialBusyPoll(stalePid, (call) => { // Permanent ENOENT — lsof is not installed events.push(`enoent-poll-${call}`); - const err = new Error("lsof not found") as NodeJS.ErrnoException; - err.code = "ENOENT"; - return { error: err, status: null, stdout: "", stderr: "" }; + return createErrnoResult("ENOENT", "lsof not found"); }); vi.spyOn(process, "kill").mockReturnValue(true); @@ -506,50 +473,26 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // EPERM occurs when lsof exists but a MAC policy (SELinux/AppArmor) blocks // execution. Like ENOENT/EACCES, this is permanent — retrying is pointless. const stalePid = process.pid + 305; - let call = 0; - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } - const err = new Error("lsof eperm") as NodeJS.ErrnoException; - err.code = "EPERM"; - return { error: err, status: null, stdout: "", stderr: "" }; - }); + const getCallCount = installInitialBusyPoll(stalePid, () => + createErrnoResult("EPERM", "lsof eperm"), + ); vi.spyOn(process, "kill").mockReturnValue(true); expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); // Must bail after exactly 1 EPERM poll — same as ENOENT/EACCES - expect(call).toBe(2); // 1 initial find + 1 EPERM poll + expect(getCallCount()).toBe(2); // 1 initial find + 1 EPERM poll }); it("bails immediately when lsof is permanently unavailable (EACCES) — same as ENOENT", () => { // EACCES and EPERM are also permanent conditions — lsof exists but the // process has no permission to run it. No point retrying. const stalePid = process.pid + 302; - let call = 0; - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } - const err = new Error("lsof permission denied") as NodeJS.ErrnoException; - err.code = "EACCES"; - return { error: err, status: null, stdout: "", stderr: "" }; - }); + const getCallCount = installInitialBusyPoll(stalePid, () => + createErrnoResult("EACCES", "lsof permission denied"), + ); vi.spyOn(process, "kill").mockReturnValue(true); expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); // Should have bailed after exactly 1 poll call (the EACCES one) - expect(call).toBe(2); // 1 initial find + 1 EACCES poll + expect(getCallCount()).toBe(2); // 1 initial find + 1 EACCES poll }); it("proceeds with warning when polling budget is exhausted — fake clock, no real 2s wait", () => { @@ -561,15 +504,10 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { let fakeNow = 0; __testing.setDateNowOverride(() => fakeNow); - mockSpawnSync.mockImplementation(() => { + installInitialBusyPoll(stalePid, () => { // Advance clock by PORT_FREE_TIMEOUT_MS + 1ms on first poll to trip the deadline. fakeNow += 2001; - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; + return createOpenClawBusyResult(stalePid); }); vi.spyOn(process, "kill").mockReturnValue(true); @@ -585,24 +523,13 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // leaving its socket in TIME_WAIT / FIN_WAIT. Skipping the poll would // silently recreate the EADDRINUSE race we are fixing. const stalePid = process.pid + 304; - let call = 0; const events: string[] = []; - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - // Initial scan: finds stale pid - events.push("initial-find"); - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } + events.push("initial-find"); + installInitialBusyPoll(stalePid, () => { // Port is already free on first poll — pid was dead before SIGTERM events.push("poll-free"); - return { error: null, status: 1, stdout: "", stderr: "" }; + return createLsofResult({ status: 1 }); }); // All SIGTERMs throw ESRCH — pid already gone @@ -623,27 +550,16 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // would recreate the EADDRINUSE race this PR is designed to prevent. const stalePid = process.pid + 301; const events: string[] = []; - let call = 0; - - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - events.push("initial-find"); - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } + events.push("initial-find"); + installInitialBusyPoll(stalePid, (call) => { if (call === 2) { // Transient: spawnSync timeout (no ENOENT code) events.push("transient-error"); - return { error: new Error("timeout"), status: null, stdout: "", stderr: "" }; + return createLsofResult({ error: new Error("timeout"), status: null }); } // Port free on the next poll events.push("port-free"); - return { error: null, status: 1, stdout: "", stderr: "" }; + return createLsofResult({ status: 1 }); }); vi.spyOn(process, "kill").mockReturnValue(true); @@ -739,30 +655,18 @@ describe.skipIf(isWindows)("restart-stale-pids", () => { // the port may be held by an unrelated process. From our perspective // (we only kill openclaw pids) it is effectively free. const stalePid = process.pid + 800; - let call = 0; - mockSpawnSync.mockImplementation(() => { - call++; - if (call === 1) { - return { - error: null, - status: 0, - stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]), - stderr: "", - }; - } + const getCallCount = installInitialBusyPoll(stalePid, () => { // status 1 + non-openclaw output — should be treated as free:true for our purposes - return { - error: null, + return createLsofResult({ status: 1, stdout: lsofOutput([{ pid: process.pid + 801, cmd: "caddy" }]), - stderr: "", - }; + }); }); vi.spyOn(process, "kill").mockReturnValue(true); // Should complete cleanly — no openclaw pids in status-1 output → free expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); // Completed in exactly 2 calls (initial find + 1 free poll) - expect(call).toBe(2); + expect(getCallCount()).toBe(2); }); });