mirror of https://github.com/openclaw/openclaw.git
refactor: share stale pid polling fixtures
This commit is contained in:
parent
bf631b5872
commit
8c21284c1c
|
|
@ -42,6 +42,51 @@ function lsofOutput(entries: Array<{ pid: number; cmd: string }>): string {
|
|||
return entries.map(({ pid, cmd }) => `p${pid}\nc${cmd}`).join("\n") + "\n";
|
||||
}
|
||||
|
||||
type MockLsofResult = {
|
||||
error: Error | null;
|
||||
status: number | null;
|
||||
stdout: string;
|
||||
stderr: string;
|
||||
};
|
||||
|
||||
function createLsofResult(overrides: Partial<MockLsofResult> = {}): MockLsofResult {
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: "",
|
||||
stderr: "",
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
function createOpenClawBusyResult(pid: number, overrides: Partial<MockLsofResult> = {}) {
|
||||
return createLsofResult({
|
||||
stdout: lsofOutput([{ pid, cmd: "openclaw-gateway" }]),
|
||||
...overrides,
|
||||
});
|
||||
}
|
||||
|
||||
function createErrnoResult(code: string, message: string) {
|
||||
const error = new Error(message) as NodeJS.ErrnoException;
|
||||
error.code = code;
|
||||
return createLsofResult({ error, status: null });
|
||||
}
|
||||
|
||||
function installInitialBusyPoll(
|
||||
stalePid: number,
|
||||
resolvePoll: (call: number) => MockLsofResult,
|
||||
): () => number {
|
||||
let call = 0;
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call += 1;
|
||||
if (call === 1) {
|
||||
return createOpenClawBusyResult(stalePid);
|
||||
}
|
||||
return resolvePoll(call);
|
||||
});
|
||||
return () => call;
|
||||
}
|
||||
|
||||
describe.skipIf(isWindows)("restart-stale-pids", () => {
|
||||
beforeEach(() => {
|
||||
mockSpawnSync.mockReset();
|
||||
|
|
@ -201,20 +246,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// lsof exits with status 1 when no matching processes are found — this is
|
||||
// the canonical "port is free" signal, not an error.
|
||||
const stalePid = process.pid + 500;
|
||||
let call = 0;
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
// Poll returns status 1 — no listeners
|
||||
return { error: null, status: 1, stdout: "", stderr: "" };
|
||||
});
|
||||
installInitialBusyPoll(stalePid, () => createLsofResult({ status: 1 }));
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
// Should complete cleanly (port reported free on status 1)
|
||||
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
|
||||
|
|
@ -225,27 +257,17 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// bad flag, runtime error) must not be mapped to free:true. They are
|
||||
// inconclusive and should keep the polling loop running until budget expires.
|
||||
const stalePid = process.pid + 501;
|
||||
let call = 0;
|
||||
const events: string[] = [];
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
events.push("initial-find");
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
events.push("initial-find");
|
||||
installInitialBusyPoll(stalePid, (call) => {
|
||||
if (call === 2) {
|
||||
// Permission/runtime error — status 2, should NOT be treated as free
|
||||
events.push("error-poll");
|
||||
return { error: null, status: 2, stdout: "", stderr: "lsof: permission denied" };
|
||||
return createLsofResult({ status: 2, stderr: "lsof: permission denied" });
|
||||
}
|
||||
// Eventually port is free
|
||||
events.push("free-poll");
|
||||
return { error: null, status: 1, stdout: "", stderr: "" };
|
||||
return createLsofResult({ status: 1 });
|
||||
});
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
cleanStaleGatewayProcessesSync();
|
||||
|
|
@ -263,29 +285,13 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// The fix: pollPortOnce now parses res.stdout directly from the first
|
||||
// spawnSync call. Exactly ONE lsof invocation per poll cycle.
|
||||
const stalePid = process.pid + 400;
|
||||
let spawnCount = 0;
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
spawnCount++;
|
||||
if (spawnCount === 1) {
|
||||
// Initial findGatewayPidsOnPortSync — returns stale pid
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
if (spawnCount === 2) {
|
||||
const getCallCount = installInitialBusyPoll(stalePid, (call) => {
|
||||
if (call === 2) {
|
||||
// First waitForPortFreeSync poll — status 0, port busy (should parse inline, not spawn again)
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
return createOpenClawBusyResult(stalePid);
|
||||
}
|
||||
// Port free on third call
|
||||
return { error: null, status: 0, stdout: "", stderr: "" };
|
||||
return createLsofResult();
|
||||
});
|
||||
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
|
|
@ -294,7 +300,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// If pollPortOnce made a second lsof call internally, spawnCount would
|
||||
// be at least 4 (initial + 2 polls each doubled). With the fix, each poll
|
||||
// is exactly one spawn: initial(1) + busy-poll(1) + free-poll(1) = 3.
|
||||
expect(spawnCount).toBe(3);
|
||||
expect(getCallCount()).toBe(3);
|
||||
});
|
||||
|
||||
it("lsof status 1 with non-empty openclaw stdout is treated as busy, not free (Linux container edge case)", () => {
|
||||
|
|
@ -302,34 +308,21 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// lsof can exit 1 AND still emit output for processes it could read.
|
||||
// status 1 + non-empty openclaw stdout must not be treated as port-free.
|
||||
const stalePid = process.pid + 601;
|
||||
let call = 0;
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
// Initial scan: finds stale pid
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
const getCallCount = installInitialBusyPoll(stalePid, (call) => {
|
||||
if (call === 2) {
|
||||
// status 1 + openclaw pid in stdout — container-restricted lsof reports partial results
|
||||
return {
|
||||
error: null,
|
||||
return createOpenClawBusyResult(stalePid, {
|
||||
status: 1,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "lsof: WARNING: can't stat() fuse",
|
||||
};
|
||||
});
|
||||
}
|
||||
// Third poll: port is genuinely free
|
||||
return { error: null, status: 1, stdout: "", stderr: "" };
|
||||
return createLsofResult({ status: 1 });
|
||||
});
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
cleanStaleGatewayProcessesSync();
|
||||
// Poll 2 returned busy (not free), so we must have polled at least 3 times
|
||||
expect(call).toBeGreaterThanOrEqual(3);
|
||||
expect(getCallCount()).toBeGreaterThanOrEqual(3);
|
||||
});
|
||||
|
||||
it("pollPortOnce outer catch returns { free: null, permanent: false } when resolveLsofCommandSync throws", () => {
|
||||
|
|
@ -382,20 +375,7 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
|
||||
it("sends SIGTERM to stale pids and returns them", () => {
|
||||
const stalePid = process.pid + 100;
|
||||
let call = 0;
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
// waitForPortFreeSync polls: port free immediately
|
||||
return { error: null, status: 0, stdout: "", stderr: "" };
|
||||
});
|
||||
installInitialBusyPoll(stalePid, () => createLsofResult());
|
||||
|
||||
const killSpy = vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
const result = cleanStaleGatewayProcessesSync();
|
||||
|
|
@ -474,24 +454,11 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// immediately on ENOENT rather than spinning the full 2-second budget.
|
||||
const stalePid = process.pid + 300;
|
||||
const events: string[] = [];
|
||||
let call = 0;
|
||||
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
events.push("initial-find");
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
events.push("initial-find");
|
||||
installInitialBusyPoll(stalePid, (call) => {
|
||||
// Permanent ENOENT — lsof is not installed
|
||||
events.push(`enoent-poll-${call}`);
|
||||
const err = new Error("lsof not found") as NodeJS.ErrnoException;
|
||||
err.code = "ENOENT";
|
||||
return { error: err, status: null, stdout: "", stderr: "" };
|
||||
return createErrnoResult("ENOENT", "lsof not found");
|
||||
});
|
||||
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
|
|
@ -506,50 +473,26 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// EPERM occurs when lsof exists but a MAC policy (SELinux/AppArmor) blocks
|
||||
// execution. Like ENOENT/EACCES, this is permanent — retrying is pointless.
|
||||
const stalePid = process.pid + 305;
|
||||
let call = 0;
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
const err = new Error("lsof eperm") as NodeJS.ErrnoException;
|
||||
err.code = "EPERM";
|
||||
return { error: err, status: null, stdout: "", stderr: "" };
|
||||
});
|
||||
const getCallCount = installInitialBusyPoll(stalePid, () =>
|
||||
createErrnoResult("EPERM", "lsof eperm"),
|
||||
);
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
|
||||
// Must bail after exactly 1 EPERM poll — same as ENOENT/EACCES
|
||||
expect(call).toBe(2); // 1 initial find + 1 EPERM poll
|
||||
expect(getCallCount()).toBe(2); // 1 initial find + 1 EPERM poll
|
||||
});
|
||||
|
||||
it("bails immediately when lsof is permanently unavailable (EACCES) — same as ENOENT", () => {
|
||||
// EACCES and EPERM are also permanent conditions — lsof exists but the
|
||||
// process has no permission to run it. No point retrying.
|
||||
const stalePid = process.pid + 302;
|
||||
let call = 0;
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
const err = new Error("lsof permission denied") as NodeJS.ErrnoException;
|
||||
err.code = "EACCES";
|
||||
return { error: err, status: null, stdout: "", stderr: "" };
|
||||
});
|
||||
const getCallCount = installInitialBusyPoll(stalePid, () =>
|
||||
createErrnoResult("EACCES", "lsof permission denied"),
|
||||
);
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
|
||||
// Should have bailed after exactly 1 poll call (the EACCES one)
|
||||
expect(call).toBe(2); // 1 initial find + 1 EACCES poll
|
||||
expect(getCallCount()).toBe(2); // 1 initial find + 1 EACCES poll
|
||||
});
|
||||
|
||||
it("proceeds with warning when polling budget is exhausted — fake clock, no real 2s wait", () => {
|
||||
|
|
@ -561,15 +504,10 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
let fakeNow = 0;
|
||||
__testing.setDateNowOverride(() => fakeNow);
|
||||
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
installInitialBusyPoll(stalePid, () => {
|
||||
// Advance clock by PORT_FREE_TIMEOUT_MS + 1ms on first poll to trip the deadline.
|
||||
fakeNow += 2001;
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
return createOpenClawBusyResult(stalePid);
|
||||
});
|
||||
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
|
|
@ -585,24 +523,13 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// leaving its socket in TIME_WAIT / FIN_WAIT. Skipping the poll would
|
||||
// silently recreate the EADDRINUSE race we are fixing.
|
||||
const stalePid = process.pid + 304;
|
||||
let call = 0;
|
||||
const events: string[] = [];
|
||||
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
// Initial scan: finds stale pid
|
||||
events.push("initial-find");
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
events.push("initial-find");
|
||||
installInitialBusyPoll(stalePid, () => {
|
||||
// Port is already free on first poll — pid was dead before SIGTERM
|
||||
events.push("poll-free");
|
||||
return { error: null, status: 1, stdout: "", stderr: "" };
|
||||
return createLsofResult({ status: 1 });
|
||||
});
|
||||
|
||||
// All SIGTERMs throw ESRCH — pid already gone
|
||||
|
|
@ -623,27 +550,16 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// would recreate the EADDRINUSE race this PR is designed to prevent.
|
||||
const stalePid = process.pid + 301;
|
||||
const events: string[] = [];
|
||||
let call = 0;
|
||||
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
events.push("initial-find");
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
events.push("initial-find");
|
||||
installInitialBusyPoll(stalePid, (call) => {
|
||||
if (call === 2) {
|
||||
// Transient: spawnSync timeout (no ENOENT code)
|
||||
events.push("transient-error");
|
||||
return { error: new Error("timeout"), status: null, stdout: "", stderr: "" };
|
||||
return createLsofResult({ error: new Error("timeout"), status: null });
|
||||
}
|
||||
// Port free on the next poll
|
||||
events.push("port-free");
|
||||
return { error: null, status: 1, stdout: "", stderr: "" };
|
||||
return createLsofResult({ status: 1 });
|
||||
});
|
||||
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
|
|
@ -739,30 +655,18 @@ describe.skipIf(isWindows)("restart-stale-pids", () => {
|
|||
// the port may be held by an unrelated process. From our perspective
|
||||
// (we only kill openclaw pids) it is effectively free.
|
||||
const stalePid = process.pid + 800;
|
||||
let call = 0;
|
||||
mockSpawnSync.mockImplementation(() => {
|
||||
call++;
|
||||
if (call === 1) {
|
||||
return {
|
||||
error: null,
|
||||
status: 0,
|
||||
stdout: lsofOutput([{ pid: stalePid, cmd: "openclaw-gateway" }]),
|
||||
stderr: "",
|
||||
};
|
||||
}
|
||||
const getCallCount = installInitialBusyPoll(stalePid, () => {
|
||||
// status 1 + non-openclaw output — should be treated as free:true for our purposes
|
||||
return {
|
||||
error: null,
|
||||
return createLsofResult({
|
||||
status: 1,
|
||||
stdout: lsofOutput([{ pid: process.pid + 801, cmd: "caddy" }]),
|
||||
stderr: "",
|
||||
};
|
||||
});
|
||||
});
|
||||
vi.spyOn(process, "kill").mockReturnValue(true);
|
||||
// Should complete cleanly — no openclaw pids in status-1 output → free
|
||||
expect(() => cleanStaleGatewayProcessesSync()).not.toThrow();
|
||||
// Completed in exactly 2 calls (initial find + 1 free poll)
|
||||
expect(call).toBe(2);
|
||||
expect(getCallCount()).toBe(2);
|
||||
});
|
||||
});
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue