mirror of https://github.com/openclaw/openclaw.git
fix: prevent heartbeat scheduler death when runOnce throws
The heartbeat scheduler's run() function had no try/catch around the runOnce() call. If runOnce() threw an unhandled exception (observed during session compaction), scheduleNext() was never called and the timer was never rescheduled, so heartbeats silently stopped until the gateway was restarted. Additionally, the early return for 'requests-in-flight' skipped scheduleNext(), which could strand the timer in edge cases.

Changes:
- Wrap runOnce() in a try/catch that logs the error, advances the timer, and continues.
- Call scheduleNext() before returning on requests-in-flight.
- Add tests for both crash recovery and requests-in-flight scheduling.

Fixes #14892
This commit is contained in:
parent
1f41f7b1e6
commit
961ca3946b
|
|
@ -54,4 +54,67 @@ describe("startHeartbeatRunner", () => {
|
|||
|
||||
runner.stop();
|
||||
});
|
||||
|
||||
it("continues scheduling after runOnce throws an unhandled error", async () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date(0));
|
||||
|
||||
let callCount = 0;
|
||||
const runSpy = vi.fn().mockImplementation(async () => {
|
||||
callCount++;
|
||||
if (callCount === 1) {
|
||||
// First call throws (simulates crash during session compaction)
|
||||
throw new Error("session compaction error");
|
||||
}
|
||||
return { status: "ran", durationMs: 1 };
|
||||
});
|
||||
|
||||
const runner = startHeartbeatRunner({
|
||||
cfg: {
|
||||
agents: { defaults: { heartbeat: { every: "30m" } } },
|
||||
} as OpenClawConfig,
|
||||
runOnce: runSpy,
|
||||
});
|
||||
|
||||
// First heartbeat fires and throws
|
||||
await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000);
|
||||
expect(runSpy).toHaveBeenCalledTimes(1);
|
||||
|
||||
// Second heartbeat should still fire (scheduler must not be dead)
|
||||
await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000);
|
||||
expect(runSpy).toHaveBeenCalledTimes(2);
|
||||
|
||||
runner.stop();
|
||||
});
|
||||
|
||||
it("reschedules timer when runOnce returns requests-in-flight", async () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date(0));
|
||||
|
||||
let callCount = 0;
|
||||
const runSpy = vi.fn().mockImplementation(async () => {
|
||||
callCount++;
|
||||
if (callCount === 1) {
|
||||
return { status: "skipped", reason: "requests-in-flight" };
|
||||
}
|
||||
return { status: "ran", durationMs: 1 };
|
||||
});
|
||||
|
||||
const runner = startHeartbeatRunner({
|
||||
cfg: {
|
||||
agents: { defaults: { heartbeat: { every: "30m" } } },
|
||||
} as OpenClawConfig,
|
||||
runOnce: runSpy,
|
||||
});
|
||||
|
||||
// First heartbeat returns requests-in-flight
|
||||
await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000);
|
||||
expect(runSpy).toHaveBeenCalledTimes(1);
|
||||
|
||||
// Timer should be rescheduled; next heartbeat should still fire
|
||||
await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000);
|
||||
expect(runSpy).toHaveBeenCalledTimes(2);
|
||||
|
||||
runner.stop();
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -897,14 +897,26 @@ export function startHeartbeatRunner(opts: {
|
|||
continue;
|
||||
}
|
||||
|
||||
const res = await runOnce({
|
||||
cfg: state.cfg,
|
||||
agentId: agent.agentId,
|
||||
heartbeat: agent.heartbeat,
|
||||
reason,
|
||||
deps: { runtime: state.runtime },
|
||||
});
|
||||
let res: HeartbeatRunResult;
|
||||
try {
|
||||
res = await runOnce({
|
||||
cfg: state.cfg,
|
||||
agentId: agent.agentId,
|
||||
heartbeat: agent.heartbeat,
|
||||
reason,
|
||||
deps: { runtime: state.runtime },
|
||||
});
|
||||
} catch (err) {
|
||||
// If runOnce throws (e.g. during session compaction), we must still
|
||||
// advance the timer and call scheduleNext so heartbeats keep firing.
|
||||
const errMsg = formatErrorMessage(err);
|
||||
log.error(`heartbeat runner: runOnce threw unexpectedly: ${errMsg}`, { error: errMsg });
|
||||
agent.lastRunMs = now;
|
||||
agent.nextDueMs = now + agent.intervalMs;
|
||||
continue;
|
||||
}
|
||||
if (res.status === "skipped" && res.reason === "requests-in-flight") {
|
||||
scheduleNext();
|
||||
return res;
|
||||
}
|
||||
if (res.status !== "skipped" || res.reason !== "disabled") {
|
||||
|
|
|
|||
Loading…
Reference in New Issue