fix: prevent heartbeat scheduler death when runOnce throws

The heartbeat scheduler's run() function had no try/catch around the
runOnce() call. If runOnce() threw an unhandled exception (observed
during session compaction), scheduleNext() was never called and the
timer was never rescheduled, so heartbeats silently stopped until the
gateway was restarted.

Additionally, the early return for 'requests-in-flight' skipped
scheduleNext(), so in edge cases no timer was left armed for the next run.

Changes:
- Wrap runOnce() in a try/catch that logs the error, advances the agent's
  lastRunMs/nextDueMs, and continues the loop
- Call scheduleNext() before returning on requests-in-flight
- Add tests for both crash recovery and requests-in-flight scheduling

Fixes #14892
This commit is contained in:
exe.dev user 2026-02-12 19:08:20 +00:00 committed by Gustavo Madeira Santana
parent 1f41f7b1e6
commit 961ca3946b
2 changed files with 82 additions and 7 deletions

View File

@ -54,4 +54,67 @@ describe("startHeartbeatRunner", () => {
runner.stop();
});
it("continues scheduling after runOnce throws an unhandled error", async () => {
  // One configured heartbeat interval ("30m") plus one second of slack.
  const pastOneInterval = 30 * 60_000 + 1_000;

  vi.useFakeTimers();
  vi.setSystemTime(new Date(0));

  // Only the very first invocation rejects (stands in for a crash during
  // session compaction); every later invocation reports a normal run.
  const runSpy = vi
    .fn()
    .mockImplementationOnce(async () => {
      throw new Error("session compaction error");
    })
    .mockImplementation(async () => ({ status: "ran", durationMs: 1 }));

  const cfg = {
    agents: { defaults: { heartbeat: { every: "30m" } } },
  } as OpenClawConfig;
  const runner = startHeartbeatRunner({ cfg, runOnce: runSpy });

  // Interval 1: runOnce is invoked once and rejects.
  await vi.advanceTimersByTimeAsync(pastOneInterval);
  expect(runSpy).toHaveBeenCalledTimes(1);

  // Interval 2: a second invocation shows the scheduler is still alive.
  await vi.advanceTimersByTimeAsync(pastOneInterval);
  expect(runSpy).toHaveBeenCalledTimes(2);

  runner.stop();
});
it("reschedules timer when runOnce returns requests-in-flight", async () => {
  // One configured heartbeat interval ("30m") plus one second of slack.
  const pastOneInterval = 30 * 60_000 + 1_000;

  vi.useFakeTimers();
  vi.setSystemTime(new Date(0));

  // The first invocation reports a skip because requests are in flight;
  // every later invocation reports a normal run.
  const runSpy = vi
    .fn()
    .mockImplementationOnce(async () => ({ status: "skipped", reason: "requests-in-flight" }))
    .mockImplementation(async () => ({ status: "ran", durationMs: 1 }));

  const cfg = {
    agents: { defaults: { heartbeat: { every: "30m" } } },
  } as OpenClawConfig;
  const runner = startHeartbeatRunner({ cfg, runOnce: runSpy });

  // Interval 1: runOnce is invoked once and returns requests-in-flight.
  await vi.advanceTimersByTimeAsync(pastOneInterval);
  expect(runSpy).toHaveBeenCalledTimes(1);

  // Interval 2: a second invocation shows the timer was rescheduled.
  await vi.advanceTimersByTimeAsync(pastOneInterval);
  expect(runSpy).toHaveBeenCalledTimes(2);

  runner.stop();
});
});

View File

@ -897,14 +897,26 @@ export function startHeartbeatRunner(opts: {
continue;
}
const res = await runOnce({
cfg: state.cfg,
agentId: agent.agentId,
heartbeat: agent.heartbeat,
reason,
deps: { runtime: state.runtime },
});
let res: HeartbeatRunResult;
try {
res = await runOnce({
cfg: state.cfg,
agentId: agent.agentId,
heartbeat: agent.heartbeat,
reason,
deps: { runtime: state.runtime },
});
} catch (err) {
// If runOnce throws (e.g. during session compaction), we must still
// advance the timer and call scheduleNext so heartbeats keep firing.
const errMsg = formatErrorMessage(err);
log.error(`heartbeat runner: runOnce threw unexpectedly: ${errMsg}`, { error: errMsg });
agent.lastRunMs = now;
agent.nextDueMs = now + agent.intervalMs;
continue;
}
if (res.status === "skipped" && res.reason === "requests-in-flight") {
scheduleNext();
return res;
}
if (res.status !== "skipped" || res.reason !== "disabled") {