From 961ca3946b2e75233e81748666eaaea0a6db0231 Mon Sep 17 00:00:00 2001 From: "exe.dev user" Date: Thu, 12 Feb 2026 19:08:20 +0000 Subject: [PATCH] fix: prevent heartbeat scheduler death when runOnce throws MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The heartbeat scheduler's run() function had no try/catch around the runOnce() call. If runOnce() threw an unhandled exception (observed during session compaction), scheduleNext() was never called and the timer was never rescheduled — causing heartbeats to silently stop forever until gateway restart. Additionally, the early return for 'requests-in-flight' skipped scheduleNext(), which could strand the timer in edge cases. Changes: - Wrap runOnce() in try/catch that logs, advances timer, continues - Call scheduleNext() before returning on requests-in-flight - Add tests for both crash recovery and requests-in-flight scheduling Fixes #14892 --- src/infra/heartbeat-runner.scheduler.test.ts | 63 ++++++++++++++++++++ src/infra/heartbeat-runner.ts | 26 +++++--- 2 files changed, 82 insertions(+), 7 deletions(-) diff --git a/src/infra/heartbeat-runner.scheduler.test.ts b/src/infra/heartbeat-runner.scheduler.test.ts index e95058880a7..e1923371ac0 100644 --- a/src/infra/heartbeat-runner.scheduler.test.ts +++ b/src/infra/heartbeat-runner.scheduler.test.ts @@ -54,4 +54,67 @@ describe("startHeartbeatRunner", () => { runner.stop(); }); + + it("continues scheduling after runOnce throws an unhandled error", async () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date(0)); + + let callCount = 0; + const runSpy = vi.fn().mockImplementation(async () => { + callCount++; + if (callCount === 1) { + // First call throws (simulates crash during session compaction) + throw new Error("session compaction error"); + } + return { status: "ran", durationMs: 1 }; + }); + + const runner = startHeartbeatRunner({ + cfg: { + agents: { defaults: { heartbeat: { every: "30m" } } }, + } as OpenClawConfig, + runOnce: runSpy, + }); + + // First heartbeat fires and throws + await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); + expect(runSpy).toHaveBeenCalledTimes(1); + + // Second heartbeat should still fire (scheduler must not be dead) + await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); + expect(runSpy).toHaveBeenCalledTimes(2); + + runner.stop(); + }); + + it("reschedules timer when runOnce returns requests-in-flight", async () => { + vi.useFakeTimers(); + vi.setSystemTime(new Date(0)); + + let callCount = 0; + const runSpy = vi.fn().mockImplementation(async () => { + callCount++; + if (callCount === 1) { + return { status: "skipped", reason: "requests-in-flight" }; + } + return { status: "ran", durationMs: 1 }; + }); + + const runner = startHeartbeatRunner({ + cfg: { + agents: { defaults: { heartbeat: { every: "30m" } } }, + } as OpenClawConfig, + runOnce: runSpy, + }); + + // First heartbeat returns requests-in-flight + await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); + expect(runSpy).toHaveBeenCalledTimes(1); + + // Timer should be rescheduled; next heartbeat should still fire + await vi.advanceTimersByTimeAsync(30 * 60_000 + 1_000); + expect(runSpy).toHaveBeenCalledTimes(2); + + runner.stop(); + }); }); diff --git a/src/infra/heartbeat-runner.ts b/src/infra/heartbeat-runner.ts index a51a8ec5636..15d93286a3c 100644 --- a/src/infra/heartbeat-runner.ts +++ b/src/infra/heartbeat-runner.ts @@ -897,14 +897,26 @@ export function startHeartbeatRunner(opts: { continue; } - const res = await runOnce({ - cfg: state.cfg, - agentId: agent.agentId, - heartbeat: agent.heartbeat, - reason, - deps: { runtime: state.runtime }, - }); + let res: HeartbeatRunResult; + try { + res = await runOnce({ + cfg: state.cfg, + agentId: agent.agentId, + heartbeat: agent.heartbeat, + reason, + deps: { runtime: state.runtime }, + }); + } catch (err) { + // If runOnce throws (e.g. during session compaction), we must still + // advance the timer and call scheduleNext so heartbeats keep firing. + const errMsg = formatErrorMessage(err); + log.error(`heartbeat runner: runOnce threw unexpectedly: ${errMsg}`, { error: errMsg }); + agent.lastRunMs = now; + agent.nextDueMs = now + agent.intervalMs; + continue; + } if (res.status === "skipped" && res.reason === "requests-in-flight") { + scheduleNext(); return res; } if (res.status !== "skipped" || res.reason !== "disabled") {