refactor: share stale pid polling fixtures

This commit is contained in:
Peter Steinberger 2026-03-13 20:36:47 +00:00
parent bf631b5872
commit 8c21284c1c
1 changed files with 85 additions and 181 deletions

View File

@ -42,6 +42,51 @@ function lsofOutput(entries: Array<{ pid: number; cmd: string }>): string {
return entries.map(({ pid, cmd }) => `p${pid}\nc${cmd}`).join("\n") + "\n"; return entries.map(({ pid, cmd }) => `p${pid}\nc${cmd}`).join("\n") + "\n";
} }
/**
 * Shape of the result objects the fixtures in this file hand back from the
 * mocked spawn call when simulating an lsof invocation.
 */
type MockLsofResult = {
// Spawn-level failure, or null when no error is being simulated.
error: Error | null;
// Exit status; the fixtures here set it to null when `error` is populated.
status: number | null;
// Captured standard output (lsof-style listing text, possibly empty).
stdout: string;
// Captured standard error text (possibly empty).
stderr: string;
};
/**
 * Builds a mock lsof result, starting from a "clean" baseline (no error,
 * status 0, empty stdout/stderr) and layering the caller's overrides on top.
 *
 * @param overrides - Fields that should replace the baseline values.
 * @returns A fresh result object; the baseline is never shared between calls.
 */
function createLsofResult(overrides: Partial<MockLsofResult> = {}): MockLsofResult {
  const baseline: MockLsofResult = { error: null, status: 0, stdout: "", stderr: "" };
  // Overrides are spread last so any field the caller supplies wins.
  return { ...baseline, ...overrides };
}
/**
 * Builds a mock lsof result whose stdout lists `pid` as an
 * "openclaw-gateway" process, i.e. the port still looks busy.
 *
 * @param pid - Process id to report in the simulated lsof listing.
 * @param overrides - Extra fields applied after the stdout is set, so they
 *   may replace any field, including stdout itself.
 */
function createOpenClawBusyResult(pid: number, overrides: Partial<MockLsofResult> = {}) {
  const busyListing = lsofOutput([{ pid, cmd: "openclaw-gateway" }]);
  const fields: Partial<MockLsofResult> = { stdout: busyListing, ...overrides };
  return createLsofResult(fields);
}
/**
 * Builds a mock lsof result representing a spawn failure: the `error` field
 * carries an Error tagged with the given errno-style `code`, and `status`
 * is null.
 *
 * @param code - Value assigned to the error's `code` property (e.g. "ENOENT").
 * @param message - Message for the underlying Error.
 */
function createErrnoResult(code: string, message: string) {
  const errnoError: NodeJS.ErrnoException = Object.assign(new Error(message), { code });
  return createLsofResult({ status: null, error: errnoError });
}
/**
 * Installs a `mockSpawnSync` implementation for the common test scenario:
 * the first invocation reports `stalePid` as a busy openclaw-gateway
 * listener, and every later invocation is delegated to `resolvePoll`.
 *
 * @param stalePid - Pid reported by the first (initial scan) invocation.
 * @param resolvePoll - Produces the result for invocation number `call`
 *   (1-based, so the first value it ever receives is 2).
 * @returns A getter for how many times the mock has been invoked so far.
 */
function installInitialBusyPoll(
  stalePid: number,
  resolvePoll: (call: number) => MockLsofResult,
): () => number {
  let invocations = 0;
  mockSpawnSync.mockImplementation(() => {
    const current = ++invocations;
    // Only the very first spawn is the fixed "busy" scan; the rest are polls.
    return current === 1 ? createOpenClawBusyResult(stalePid) : resolvePoll(current);
  });
  return () => invocations;
}
describe.skipIf(isWindows)("restart-stale-pids", () => { describe.skipIf(isWindows)("restart-stale-pids", () => {
beforeEach(() => { beforeEach(() => {
mockSpawnSync.mockReset(); mockSpawnSync.mockReset();
@ -201,20 +246,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// lsof exits with status 1 when no matching processes are found — this is // lsof exits with status 1 when no matching processes are found — this is
// the canonical "port is free" signal, not an error. // the canonical "port is free" signal, not an error.
const stalePid = process.pid + 500; const stalePid = process.pid + 500;
let call = 0; installInitialBusyPoll(stalePid, () => createLsofResult({ status: 1 }));
mockSpawnSync.mockImplementation(() => {
call++;
if (call === 1) {
return {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
// Poll returns status 1 — no listeners
return { error: null, status: 1, stdout: "", stderr: "" };
});
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
// Should complete cleanly (port reported free on status 1) // Should complete cleanly (port reported free on status 1)
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
@ -225,27 +257,17 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// bad flag, runtime error) must not be mapped to free:true. They are // bad flag, runtime error) must not be mapped to free:true. They are
// inconclusive and should keep the polling loop running until budget expires. // inconclusive and should keep the polling loop running until budget expires.
const stalePid = process.pid + 501; const stalePid = process.pid + 501;
let call = 0;
const events: string[] = []; const events: string[] = [];
mockSpawnSync.mockImplementation(() => {
call++;
if (call === 1) {
events.push("initial-find"); events.push("initial-find");
return { installInitialBusyPoll(stalePid, (call) => {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
if (call === 2) { if (call === 2) {
// Permission/runtime error — status 2, should NOT be treated as free // Permission/runtime error — status 2, should NOT be treated as free
events.push("error-poll"); events.push("error-poll");
return { error: null, status: 2, stdout: "", stderr: "lsof: permission denied" }; return createLsofResult({ status: 2, stderr: "lsof: permission denied" });
} }
// Eventually port is free // Eventually port is free
events.push("free-poll"); events.push("free-poll");
return { error: null, status: 1, stdout: "", stderr: "" }; return createLsofResult({ status: 1 });
}); });
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
cleanStaleGatewayProcessesSync(); cleanStaleGatewayProcessesSync();
@ -263,29 +285,13 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// The fix: pollPortOnce now parses res.stdout directly from the first // The fix: pollPortOnce now parses res.stdout directly from the first
// spawnSync call. Exactly ONE lsof invocation per poll cycle. // spawnSync call. Exactly ONE lsof invocation per poll cycle.
const stalePid = process.pid + 400; const stalePid = process.pid + 400;
let spawnCount = 0; const getCallCount = installInitialBusyPoll(stalePid, (call) => {
mockSpawnSync.mockImplementation(() => { if (call === 2) {
spawnCount++;
if (spawnCount === 1) {
// Initial findGatewayPidsOnPortSync — returns stale pid
return {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
if (spawnCount === 2) {
// First waitForPortFreeSync poll — status 0, port busy (should parse inline, not spawn again) // First waitForPortFreeSync poll — status 0, port busy (should parse inline, not spawn again)
return { return createOpenClawBusyResult(stalePid);
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
} }
// Port free on third call // Port free on third call
return { error: null, status: 0, stdout: "", stderr: "" }; return createLsofResult();
}); });
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
@ -294,7 +300,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// If pollPortOnce made a second lsof call internally, spawnCount would // If pollPortOnce made a second lsof call internally, spawnCount would
// be at least 4 (initial + 2 polls each doubled). With the fix, each poll // be at least 4 (initial + 2 polls each doubled). With the fix, each poll
// is exactly one spawn: initial(1) + busy-poll(1) + free-poll(1) = 3. // is exactly one spawn: initial(1) + busy-poll(1) + free-poll(1) = 3.
expect(spawnCount).toBe(3); expect(getCallCount()).toBe(3);
}); });
it("lsof status 1 with non-empty openclaw stdout is treated as busy, not free (Linux container edge case)", () => { it("lsof status 1 with non-empty openclaw stdout is treated as busy, not free (Linux container edge case)", () => {
@ -302,34 +308,21 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// lsof can exit 1 AND still emit output for processes it could read. // lsof can exit 1 AND still emit output for processes it could read.
// status 1 + non-empty openclaw stdout must not be treated as port-free. // status 1 + non-empty openclaw stdout must not be treated as port-free.
const stalePid = process.pid + 601; const stalePid = process.pid + 601;
let call = 0; const getCallCount = installInitialBusyPoll(stalePid, (call) => {
mockSpawnSync.mockImplementation(() => {
call++;
if (call === 1) {
// Initial scan: finds stale pid
return {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
if (call === 2) { if (call === 2) {
// status 1 + openclaw pid in stdout — container-restricted lsof reports partial results // status 1 + openclaw pid in stdout — container-restricted lsof reports partial results
return { return createOpenClawBusyResult(stalePid, {
error: null,
status: 1, status: 1,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "lsof: WARNING: can't stat() fuse", stderr: "lsof: WARNING: can't stat() fuse",
}; });
} }
// Third poll: port is genuinely free // Third poll: port is genuinely free
return { error: null, status: 1, stdout: "", stderr: "" }; return createLsofResult({ status: 1 });
}); });
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
cleanStaleGatewayProcessesSync(); cleanStaleGatewayProcessesSync();
// Poll 2 returned busy (not free), so we must have polled at least 3 times // Poll 2 returned busy (not free), so we must have polled at least 3 times
expect(call).toBeGreaterThanOrEqual(3); expect(getCallCount()).toBeGreaterThanOrEqual(3);
}); });
it("pollPortOnce outer catch returns { free: null, permanent: false } when resolveLsofCommandSync throws", () => { it("pollPortOnce outer catch returns { free: null, permanent: false } when resolveLsofCommandSync throws", () => {
@ -382,20 +375,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
it("sends SIGTERM to stale pids and returns them", () => { it("sends SIGTERM to stale pids and returns them", () => {
const stalePid = process.pid + 100; const stalePid = process.pid + 100;
let call = 0; installInitialBusyPoll(stalePid, () => createLsofResult());
mockSpawnSync.mockImplementation(() => {
call++;
if (call === 1) {
return {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
// waitForPortFreeSync polls: port free immediately
return { error: null, status: 0, stdout: "", stderr: "" };
});
const killSpy = vi.spyOn(process, "kill").mockReturnValue(true); const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
const result = cleanStaleGatewayProcessesSync(); const result = cleanStaleGatewayProcessesSync();
@ -474,24 +454,11 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// immediately on ENOENT rather than spinning the full 2-second budget. // immediately on ENOENT rather than spinning the full 2-second budget.
const stalePid = process.pid + 300; const stalePid = process.pid + 300;
const events: string[] = []; const events: string[] = [];
let call = 0;
mockSpawnSync.mockImplementation(() => {
call++;
if (call === 1) {
events.push("initial-find"); events.push("initial-find");
return { installInitialBusyPoll(stalePid, (call) => {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
// Permanent ENOENT — lsof is not installed // Permanent ENOENT — lsof is not installed
events.push(`enoent-poll-${call}`); events.push(`enoent-poll-${call}`);
const err = new Error("lsof not found") as NodeJS.ErrnoException; return createErrnoResult("ENOENT", "lsof not found");
err.code = "ENOENT";
return { error: err, status: null, stdout: "", stderr: "" };
}); });
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
@ -506,50 +473,26 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// EPERM occurs when lsof exists but a MAC policy (SELinux/AppArmor) blocks // EPERM occurs when lsof exists but a MAC policy (SELinux/AppArmor) blocks
// execution. Like ENOENT/EACCES, this is permanent — retrying is pointless. // execution. Like ENOENT/EACCES, this is permanent — retrying is pointless.
const stalePid = process.pid + 305; const stalePid = process.pid + 305;
let call = 0; const getCallCount = installInitialBusyPoll(stalePid, () =>
mockSpawnSync.mockImplementation(() => { createErrnoResult("EPERM", "lsof eperm"),
call++; );
if (call === 1) {
return {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
const err = new Error("lsof eperm") as NodeJS.ErrnoException;
err.code = "EPERM";
return { error: err, status: null, stdout: "", stderr: "" };
});
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
// Must bail after exactly 1 EPERM poll — same as ENOENT/EACCES // Must bail after exactly 1 EPERM poll — same as ENOENT/EACCES
expect(call).toBe(2); // 1 initial find + 1 EPERM poll expect(getCallCount()).toBe(2); // 1 initial find + 1 EPERM poll
}); });
it("bails immediately when lsof is permanently unavailable (EACCES) — same as ENOENT", () => { it("bails immediately when lsof is permanently unavailable (EACCES) — same as ENOENT", () => {
// EACCES and EPERM are also permanent conditions — lsof exists but the // EACCES and EPERM are also permanent conditions — lsof exists but the
// process has no permission to run it. No point retrying. // process has no permission to run it. No point retrying.
const stalePid = process.pid + 302; const stalePid = process.pid + 302;
let call = 0; const getCallCount = installInitialBusyPoll(stalePid, () =>
mockSpawnSync.mockImplementation(() => { createErrnoResult("EACCES", "lsof permission denied"),
call++; );
if (call === 1) {
return {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
const err = new Error("lsof permission denied") as NodeJS.ErrnoException;
err.code = "EACCES";
return { error: err, status: null, stdout: "", stderr: "" };
});
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
// Should have bailed after exactly 1 poll call (the EACCES one) // Should have bailed after exactly 1 poll call (the EACCES one)
expect(call).toBe(2); // 1 initial find + 1 EACCES poll expect(getCallCount()).toBe(2); // 1 initial find + 1 EACCES poll
}); });
it("proceeds with warning when polling budget is exhausted — fake clock, no real 2s wait", () => { it("proceeds with warning when polling budget is exhausted — fake clock, no real 2s wait", () => {
@ -561,15 +504,10 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
let fakeNow = 0; let fakeNow = 0;
__testing.setDateNowOverride(() => fakeNow); __testing.setDateNowOverride(() => fakeNow);
mockSpawnSync.mockImplementation(() => { installInitialBusyPoll(stalePid, () => {
// Advance clock by PORT_FREE_TIMEOUT_MS + 1ms on first poll to trip the deadline. // Advance clock by PORT_FREE_TIMEOUT_MS + 1ms on first poll to trip the deadline.
fakeNow += 2001; fakeNow += 2001;
return { return createOpenClawBusyResult(stalePid);
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}); });
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
@ -585,24 +523,13 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// leaving its socket in TIME_WAIT / FIN_WAIT. Skipping the poll would // leaving its socket in TIME_WAIT / FIN_WAIT. Skipping the poll would
// silently recreate the EADDRINUSE race we are fixing. // silently recreate the EADDRINUSE race we are fixing.
const stalePid = process.pid + 304; const stalePid = process.pid + 304;
let call = 0;
const events: string[] = []; const events: string[] = [];
mockSpawnSync.mockImplementation(() => {
call++;
if (call === 1) {
// Initial scan: finds stale pid
events.push("initial-find"); events.push("initial-find");
return { installInitialBusyPoll(stalePid, () => {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
// Port is already free on first poll — pid was dead before SIGTERM // Port is already free on first poll — pid was dead before SIGTERM
events.push("poll-free"); events.push("poll-free");
return { error: null, status: 1, stdout: "", stderr: "" }; return createLsofResult({ status: 1 });
}); });
// All SIGTERMs throw ESRCH — pid already gone // All SIGTERMs throw ESRCH — pid already gone
@ -623,27 +550,16 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// would recreate the EADDRINUSE race this PR is designed to prevent. // would recreate the EADDRINUSE race this PR is designed to prevent.
const stalePid = process.pid + 301; const stalePid = process.pid + 301;
const events: string[] = []; const events: string[] = [];
let call = 0;
mockSpawnSync.mockImplementation(() => {
call++;
if (call === 1) {
events.push("initial-find"); events.push("initial-find");
return { installInitialBusyPoll(stalePid, (call) => {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
if (call === 2) { if (call === 2) {
// Transient: spawnSync timeout (no ENOENT code) // Transient: spawnSync timeout (no ENOENT code)
events.push("transient-error"); events.push("transient-error");
return { error: new Error("timeout"), status: null, stdout: "", stderr: "" }; return createLsofResult({ error: new Error("timeout"), status: null });
} }
// Port free on the next poll // Port free on the next poll
events.push("port-free"); events.push("port-free");
return { error: null, status: 1, stdout: "", stderr: "" }; return createLsofResult({ status: 1 });
}); });
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
@ -739,30 +655,18 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
// the port may be held by an unrelated process. From our perspective // the port may be held by an unrelated process. From our perspective
// (we only kill openclaw pids) it is effectively free. // (we only kill openclaw pids) it is effectively free.
const stalePid = process.pid + 800; const stalePid = process.pid + 800;
let call = 0; const getCallCount = installInitialBusyPoll(stalePid, () => {
mockSpawnSync.mockImplementation(() => {
call++;
if (call === 1) {
return {
error: null,
status: 0,
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
stderr: "",
};
}
// status 1 + non-openclaw output — should be treated as free:true for our purposes // status 1 + non-openclaw output — should be treated as free:true for our purposes
return { return createLsofResult({
error: null,
status: 1, status: 1,
stdout: lsofOutput([{ pid: process.pid + 801, cmd: "caddy" }]), stdout: lsofOutput([{ pid: process.pid + 801, cmd: "caddy" }]),
stderr: "", });
};
}); });
vi.spyOn(process, "kill").mockReturnValue(true); vi.spyOn(process, "kill").mockReturnValue(true);
// Should complete cleanly — no openclaw pids in status-1 output → free // Should complete cleanly — no openclaw pids in status-1 output → free
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow(); expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
// Completed in exactly 2 calls (initial find + 1 free poll) // Completed in exactly 2 calls (initial find + 1 free poll)
expect(call).toBe(2); expect(getCallCount()).toBe(2);
}); });
}); });