Cron: fix 1/3 timeout on fresh isolated CLI runs (openclaw#30140) thanks @ningding97

Verified: - pnpm build - pnpm check - pnpm test:macmini Co-authored-by: ningding97 <17723822+ningding97@users.noreply.github.com> Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
2026-03-02 09:34:18 +08:00 · 2026-03-02 09:34:18 +08:00 · ca770622b3
parent 949200d7cb
commit ca770622b3
4 changed files with 170 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -106,6 +106,7 @@ Docs: https://docs.openclaw.ai
 - Slack/Subagent completion delivery: stop forcing bound conversation IDs into `threadId` so Slack completion announces do not send invalid `thread_ts` for DMs/top-level channels. Landed from contributor PR #31105 by @stakeswky. Thanks @stakeswky.
 - Signal/Loop protection: evaluate own-account detection before sync-message filtering (including UUID-only `accountUuid` configs) so `sentTranscript` sync events cannot bypass loop protection and self-reply loops. Landed from contributor PR #31093 by @kevinWangSheng. Thanks @kevinWangSheng.
 - Gateway/Control UI origins: support wildcard `"*"` in `gateway.controlUi.allowedOrigins` for trusted remote access setups. Landed from contributor PR #31088 by @frankekn. Thanks @frankekn.
+- Cron/Isolated CLI timeout ratio: avoid reusing persisted CLI session IDs on fresh isolated cron runs so the fresh watchdog profile is used and jobs do not abort at roughly one-third of configured `timeoutSeconds`. (#30140) Thanks @ningding97.
 - Security/Sandbox media reads: eliminate sandbox media TOCTOU symlink-retarget escapes by enforcing root-scoped boundary-safe reads at attachment/image load time and consolidating shared safe-read helpers across sandbox media callsites. This ships in the next npm release. Thanks @tdjackey for reporting.
 - Security/Subagents sandbox inheritance: block sandboxed sessions from spawning cross-agent subagents that would run unsandboxed, preventing runtime sandbox downgrade via `sessions_spawn agentId`. Thanks @tdjackey for reporting.
 - Security/Workspace safe writes: harden `writeFileWithinRoot` against symlink-retarget TOCTOU races by opening existing files without truncation, creating missing files with exclusive create, deferring truncation until post-open identity+boundary validation, and removing out-of-root create artifacts on blocked races; added regression tests for truncate/create race paths. This ships in the next npm release (`2026.3.1`). Thanks @tdjackey for reporting.
--- a/src/cron/isolated-agent/run.skill-filter.test.ts
+++ b/src/cron/isolated-agent/run.skill-filter.test.ts
@ -95,12 +95,14 @@ vi.mock("../../agents/subagent-announce.js", () => ({
  runSubagentAnnounceFlow: vi.fn().mockResolvedValue(true),
 }));

+const runCliAgentMock = vi.fn();
 vi.mock("../../agents/cli-runner.js", () => ({
-  runCliAgent: vi.fn(),
+  runCliAgent: runCliAgentMock,
 }));

+const getCliSessionIdMock = vi.fn().mockReturnValue(undefined);
 vi.mock("../../agents/cli-session.js", () => ({
-  getCliSessionId: vi.fn().mockReturnValue(undefined),
+  getCliSessionId: getCliSessionIdMock,
  setCliSessionId: vi.fn(),
 }));

@ -494,4 +496,82 @@ describe("runCronIsolatedAgentTurn — skill filter", () => {
      expect(runWithModelFallbackMock).not.toHaveBeenCalled();
    });
  });
+
+  describe("CLI session handoff (issue #29774)", () => {
+    it("does not pass stored cliSessionId on fresh isolated runs (isNewSession=true)", async () => {
+      // Simulate a persisted CLI session ID from a previous run.
+      getCliSessionIdMock.mockReturnValue("prev-cli-session-abc");
+      isCliProviderMock.mockReturnValue(true);
+      runCliAgentMock.mockResolvedValue({
+        payloads: [{ text: "output" }],
+        meta: { agentMeta: { sessionId: "new-cli-session-xyz", usage: { input: 5, output: 10 } } },
+      });
+      // Make runWithModelFallback invoke the run callback so the CLI path executes.
+      runWithModelFallbackMock.mockImplementationOnce(
+        async (params: { run: (provider: string, model: string) => Promise<unknown> }) => {
+          const result = await params.run("claude-cli", "claude-opus-4-6");
+          return { result, provider: "claude-cli", model: "claude-opus-4-6", attempts: [] };
+        },
+      );
+      resolveCronSessionMock.mockReturnValue({
+        storePath: "/tmp/store.json",
+        store: {},
+        sessionEntry: {
+          sessionId: "test-session-fresh",
+          updatedAt: 0,
+          systemSent: false,
+          skillsSnapshot: undefined,
+          // A stored CLI session ID that should NOT be reused on fresh runs.
+          cliSessionIds: { "claude-cli": "prev-cli-session-abc" },
+        },
+        systemSent: false,
+        isNewSession: true,
+      });
+
+      await runCronIsolatedAgentTurn(makeParams());
+
+      expect(runCliAgentMock).toHaveBeenCalledOnce();
+      // Fresh session: cliSessionId must be undefined, not the stored value.
+      expect(runCliAgentMock.mock.calls[0][0]).toHaveProperty("cliSessionId", undefined);
+    });
+
+    it("reuses stored cliSessionId on continuation runs (isNewSession=false)", async () => {
+      getCliSessionIdMock.mockReturnValue("existing-cli-session-def");
+      isCliProviderMock.mockReturnValue(true);
+      runCliAgentMock.mockResolvedValue({
+        payloads: [{ text: "output" }],
+        meta: {
+          agentMeta: { sessionId: "existing-cli-session-def", usage: { input: 5, output: 10 } },
+        },
+      });
+      runWithModelFallbackMock.mockImplementationOnce(
+        async (params: { run: (provider: string, model: string) => Promise<unknown> }) => {
+          const result = await params.run("claude-cli", "claude-opus-4-6");
+          return { result, provider: "claude-cli", model: "claude-opus-4-6", attempts: [] };
+        },
+      );
+      resolveCronSessionMock.mockReturnValue({
+        storePath: "/tmp/store.json",
+        store: {},
+        sessionEntry: {
+          sessionId: "test-session-continuation",
+          updatedAt: 0,
+          systemSent: false,
+          skillsSnapshot: undefined,
+          cliSessionIds: { "claude-cli": "existing-cli-session-def" },
+        },
+        systemSent: false,
+        isNewSession: false,
+      });
+
+      await runCronIsolatedAgentTurn(makeParams());
+
+      expect(runCliAgentMock).toHaveBeenCalledOnce();
+      // Continuation: cliSessionId should be passed through for session resume.
+      expect(runCliAgentMock.mock.calls[0][0]).toHaveProperty(
+        "cliSessionId",
+        "existing-cli-session-def",
+      );
+    });
+  });
 });
--- a/src/cron/isolated-agent/run.ts
+++ b/src/cron/isolated-agent/run.ts
@ -463,7 +463,14 @@ export async function runCronIsolatedAgentTurn(params: {
          throw new Error(abortReason());
        }
        if (isCliProvider(providerOverride, cfgWithAgentDefaults)) {
-          const cliSessionId = getCliSessionId(cronSession.sessionEntry, providerOverride);
+          // Fresh isolated cron sessions must not reuse a stored CLI session ID.
+          // Passing an existing ID activates the resume watchdog profile
+          // (noOutputTimeoutRatio 0.3, maxMs 180 s) instead of the fresh profile
+          // (ratio 0.8, maxMs 600 s), causing jobs to time out at roughly 1/3 of
+          // the configured timeoutSeconds. See: https://github.com/openclaw/openclaw/issues/29774
+          const cliSessionId = cronSession.isNewSession
+            ? undefined
+            : getCliSessionId(cronSession.sessionEntry, providerOverride);
          return runCliAgent({
            sessionId: cronSession.sessionEntry.sessionId,
            sessionKey: agentSessionKey,
--- a/src/cron/service.issue-regressions.test.ts
+++ b/src/cron/service.issue-regressions.test.ts
@ -1513,4 +1513,83 @@ describe("Cron issue regressions", () => {
    expect(jobs.find((job) => job.id === first.id)?.state.lastStatus).toBe("ok");
    expect(jobs.find((job) => job.id === second.id)?.state.lastStatus).toBe("ok");
  });
+
+  // Regression: isolated cron runs must not abort at 1/3 of configured timeoutSeconds.
+  // The bug (issue #29774) caused the CLI-provider resume watchdog (ratio 0.3, maxMs 180 s)
+  // to be applied on fresh sessions because a persisted cliSessionId was passed to
+  // runCliAgent even when isNewSession=true.  At the service level this manifests as a
+  // job abort that fires much sooner than the configured outer timeout.
+  it("outer cron timeout fires at configured timeoutSeconds, not at 1/3 (#29774)", async () => {
+    vi.useRealTimers();
+    const store = await makeStorePath();
+    const scheduledAt = Date.parse("2026-02-15T13:00:00.000Z");
+
+    // Use a short but observable timeout: 300 ms.
+    // Before the fix, premature timeout would fire at ~100 ms (1/3 of 300 ms).
+    const timeoutSeconds = 0.3;
+    const cronJob = createIsolatedRegressionJob({
+      id: "timeout-fraction-29774",
+      name: "timeout fraction regression",
+      scheduledAt,
+      schedule: { kind: "at", at: new Date(scheduledAt).toISOString() },
+      payload: { kind: "agentTurn", message: "work", timeoutSeconds },
+      state: { nextRunAtMs: scheduledAt },
+    });
+    await writeCronJobs(store.storePath, [cronJob]);
+
+    const tempFile = path.join(os.tmpdir(), `cron-29774-${Date.now()}.txt`);
+    let now = scheduledAt;
+    const wallStart = Date.now();
+    let abortWallMs: number | undefined;
+
+    const state = createCronServiceState({
+      cronEnabled: true,
+      storePath: store.storePath,
+      log: noopLogger,
+      nowMs: () => now,
+      enqueueSystemEvent: vi.fn(),
+      requestHeartbeatNow: vi.fn(),
+      runIsolatedAgentJob: vi.fn(async ({ abortSignal }: { abortSignal?: AbortSignal }) => {
+        // Real side effect: confirm the job actually started.
+        await fs.writeFile(tempFile, "started", "utf-8");
+        await new Promise<void>((resolve) => {
+          if (!abortSignal) {
+            resolve();
+            return;
+          }
+          if (abortSignal.aborted) {
+            abortWallMs = Date.now();
+            resolve();
+            return;
+          }
+          abortSignal.addEventListener(
+            "abort",
+            () => {
+              abortWallMs = Date.now();
+              resolve();
+            },
+            { once: true },
+          );
+        });
+        now += 5;
+        return { status: "ok" as const, summary: "done" };
+      }),
+    });
+
+    await onTimer(state);
+
+    // Confirm job started (real side effect).
+    await expect(fs.readFile(tempFile, "utf-8")).resolves.toBe("started");
+    await fs.unlink(tempFile).catch(() => {});
+
+    // The outer cron timeout fires at timeoutSeconds * 1000 = 300 ms.
+    // The abort must not have fired at ~100 ms (the 1/3 regression value).
+    // Allow generous lower bound (80%) to keep the test stable on loaded CI runners.
+    const elapsedMs = (abortWallMs ?? Date.now()) - wallStart;
+    expect(elapsedMs).toBeGreaterThanOrEqual(timeoutSeconds * 1000 * 0.8);
+
+    const job = state.store?.jobs.find((entry) => entry.id === "timeout-fraction-29774");
+    expect(job?.state.lastStatus).toBe("error");
+    expect(job?.state.lastError).toContain("timed out");
+  });
 });