fix: complete cron isolated model-switch retry (#57972) (thanks @issaba1)

* fix: handle LiveSessionModelSwitchError in cron isolated sessions The main agent runner catches LiveSessionModelSwitchError and retries with the requested model, but cron isolated sessions hit this error and fail immediately. This extends the retry to cover cron execution. When a cron job with `sessionTarget: 'isolated'` specifies a `model` different from the agent's primary, the embedded runner throws LiveSessionModelSwitchError (because the session initialized with the wrong model). The fix wraps the initial runPrompt call in a retry loop that catches this error, updates provider/model state, and re-runs — mirroring the existing retry logic in agent-runner-execution.ts. Fixes #57206 * fix: carry auth profile through cron model retry * fix: complete cron isolated model-switch retry (#57972) (thanks @issaba1) --------- Co-authored-by: Isaac Saba <isaacsaba@Isaacs-Mac-mini.local> Co-authored-by: Ayaan Zaidi <hi@obviy.us>
2026-03-30 22:33:37 -06:00 · 2026-03-30 22:33:37 -06:00 · 10ac6ead6b
parent 7516b423eb
commit 10ac6ead6b
3 changed files with 340 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
 - Diffs: fall back to plain text when `lang` hints are invalid during diff render and viewer hydration, so bad or stale language values no longer break the diff viewer. (#57902) Thanks @gumadeiras.
 - Doctor/plugins: skip false Matrix legacy-helper warnings when no migration plans exist, and keep bundled `enabledByDefault` plugins in the gateway startup set. (#57931) Thanks @dinakars777.
 - Matrix/CLI send: start one-off Matrix send clients before outbound delivery so `openclaw message send --channel matrix` restores E2EE in encrypted rooms instead of sending plain events. (#57936) Thanks @gumadeiras.
+- Cron/isolated sessions: carry the full live-session provider, model, and auth-profile selection across retry restarts so cron jobs with model overrides no longer fail or loop on mid-run model-switch requests. (#57972) Thanks @issaba1.
 - Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss.
 - Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc.
 - Tools/web_fetch: add an explicit trusted env-proxy path for proxy-only installs while keeping strict SSRF fetches on the pinned direct path, so trusted proxy routing does not weaken strict destination binding. (#50650) Thanks @kkav004.
--- a/src/cron/isolated-agent/run.live-session-model-switch.test.ts
+++ b/src/cron/isolated-agent/run.live-session-model-switch.test.ts
@ -0,0 +1,264 @@
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import { LiveSessionModelSwitchError } from "../../agents/live-model-switch.js";
+import {
+  clearFastTestEnv,
+  loadRunCronIsolatedAgentTurn,
+  logWarnMock,
+  makeCronSession,
+  makeCronSessionEntry,
+  resolveAllowedModelRefMock,
+  resolveConfiguredModelRefMock,
+  resolveCronSessionMock,
+  resolveSessionAuthProfileOverrideMock,
+  resetRunCronIsolatedAgentTurnHarness,
+  runEmbeddedPiAgentMock,
+  runWithModelFallbackMock,
+  updateSessionStoreMock,
+} from "./run.test-harness.js";
+
+const runCronIsolatedAgentTurn = await loadRunCronIsolatedAgentTurn();
+
+// ---------- helpers ----------
+
+/**
+ * Builds the cron job fixture used by this suite: an isolated-session
+ * `agentTurn` job whose payload explicitly requests
+ * `anthropic/claude-sonnet-4-6`.
+ *
+ * @param overrides Top-level job fields that replace the defaults (shallow
+ *   merge — passing `payload` replaces the whole payload object).
+ * @returns The job, cast to `never` so it satisfies whatever job type the
+ *   function under test declares.
+ */
+function makeJob(overrides?: Record<string, unknown>) {
+  // Cron requests sonnet, while the suite's setup resolves the configured
+  // default model to opus — the mismatch is what triggers the model switch.
+  const payload = {
+    kind: "agentTurn",
+    message: "run task",
+    model: "anthropic/claude-sonnet-4-6",
+  };
+  const job = {
+    id: "cron-model-switch-job",
+    name: "Model Switch Test",
+    schedule: { kind: "cron", expr: "0 * * * *", tz: "UTC" },
+    sessionTarget: "isolated",
+    payload,
+    ...overrides,
+  };
+  return job as never;
+}
+
+/**
+ * Builds the argument object passed to `runCronIsolatedAgentTurn` in these
+ * tests: an empty config, stubbed deps, the default fixture job, and a fixed
+ * message and session key.
+ *
+ * @param overrides Top-level fields that replace the defaults (shallow merge).
+ */
+function makeParams(overrides?: Record<string, unknown>) {
+  const job = makeJob();
+  return {
+    cfg: {},
+    deps: {} as never,
+    job,
+    message: "run task",
+    sessionKey: "cron:model-switch",
+    ...overrides,
+  };
+}
+
+/**
+ * Builds the value a successful `runWithModelFallback` mock resolves with:
+ * one text payload plus agent metadata reporting the given model on the
+ * `anthropic` provider, mirrored at the top level alongside an empty
+ * `attempts` list.
+ *
+ * @param modelUsed Model id reported both in `agentMeta` and at the top level.
+ */
+function makeSuccessfulRunResult(modelUsed = "claude-sonnet-4-6") {
+  const provider = "anthropic";
+  const agentMeta = {
+    model: modelUsed,
+    provider,
+    usage: { input: 100, output: 50 },
+  };
+  const result = {
+    payloads: [{ text: "task complete" }],
+    meta: { agentMeta },
+  };
+  return { result, provider, model: modelUsed, attempts: [] };
+}
+
+// ---------- tests ----------
+
+describe("runCronIsolatedAgentTurn — LiveSessionModelSwitchError retry (#57206)", () => {
+  let previousFastTestEnv: string | undefined;
+
+  // Shared setup: reset the harness, then arrange a brand-new cron session
+  // with no stored model while the configured default resolves to opus.
+  beforeEach(async () => {
+    // Remember whatever clearFastTestEnv() hands back so afterEach can restore it.
+    previousFastTestEnv = clearFastTestEnv();
+    resetRunCronIsolatedAgentTurnHarness();
+
+    const configuredDefault = { provider: "anthropic", model: "claude-opus-4-6" };
+    resolveConfiguredModelRefMock.mockReturnValue(configuredDefault);
+
+    // Accept any "provider/model" string and echo it back as a resolved ref.
+    resolveAllowedModelRefMock.mockImplementation(({ raw }: { raw: string }) => {
+      const segments = raw.split("/");
+      return { ref: { provider: segments[0], model: segments[1] } };
+    });
+
+    const freshEntry = makeCronSessionEntry({
+      model: undefined,
+      modelProvider: undefined,
+    });
+    const freshSession = makeCronSession({
+      sessionEntry: freshEntry,
+      isNewSession: true,
+    });
+    resolveCronSessionMock.mockReturnValue(freshSession);
+
+    updateSessionStoreMock.mockResolvedValue(undefined);
+    logWarnMock.mockReturnValue(undefined);
+  });
+
+  // Put OPENCLAW_TEST_FAST back the way it was before the test ran: restore
+  // the saved value, or remove the variable when there was none.
+  afterEach(() => {
+    if (previousFastTestEnv === undefined) {
+      delete process.env.OPENCLAW_TEST_FAST;
+      return;
+    }
+    process.env.OPENCLAW_TEST_FAST = previousFastTestEnv;
+  });
+
+  it("retries with the requested model when runWithModelFallback throws LiveSessionModelSwitchError on the first attempt", async () => {
+    const requested = { provider: "anthropic", model: "claude-sonnet-4-6" };
+    const switchError = new LiveSessionModelSwitchError(requested);
+
+    let attempts = 0;
+    runWithModelFallbackMock.mockImplementation(
+      async (params: {
+        provider: string;
+        model: string;
+        run: (p: string, m: string) => Promise<unknown>;
+      }) => {
+        attempts += 1;
+        if (attempts > 1) {
+          // Retry: the call must now carry the selection requested by the error.
+          expect(params.provider).toBe(requested.provider);
+          expect(params.model).toBe(requested.model);
+          return makeSuccessfulRunResult(requested.model);
+        }
+        // First attempt: simulate the runner asking to switch to sonnet.
+        throw switchError;
+      },
+    );
+
+    const outcome = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(outcome.status).toBe("ok");
+    expect(attempts).toBe(2);
+  });
+
+  it("persists switched provider/model before retrying", async () => {
+    const cronSession = makeCronSession({
+      sessionEntry: makeCronSessionEntry({
+        model: undefined,
+        modelProvider: undefined,
+      }),
+      isNewSession: true,
+    });
+    resolveCronSessionMock.mockReturnValue(cronSession);
+    const switchError = new LiveSessionModelSwitchError({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+    });
+
+    // Attempt 1 requests the model switch; attempt 2 (the retry) fails for an
+    // unrelated reason. Only these two queued rejections are installed: a
+    // catch-all implementation that always throws the switch error would be
+    // shadowed by them and would only ever run on an unexpected third call.
+    runWithModelFallbackMock
+      .mockRejectedValueOnce(switchError)
+      .mockRejectedValueOnce(new Error("transient network error"));
+
+    const result = await runCronIsolatedAgentTurn(makeParams());
+
+    // The run fails with the retry's error...
+    expect(result.status).toBe("error");
+    expect(String(result.error)).toContain("transient network error");
+    // ...but the switched selection was already written to the session entry.
+    expect(updateSessionStoreMock).toHaveBeenCalled();
+    expect(cronSession.sessionEntry).toMatchObject({
+      model: "claude-sonnet-4-6",
+      modelProvider: "anthropic",
+    });
+  });
+
+  it("retries with switched auth profile state from LiveSessionModelSwitchError", async () => {
+    // The session starts on an auto-selected "profile-a".
+    resolveSessionAuthProfileOverrideMock.mockResolvedValue("profile-a");
+    const initialEntry = makeCronSessionEntry({
+      model: undefined,
+      modelProvider: undefined,
+      authProfileOverride: "profile-a",
+      authProfileOverrideSource: "auto",
+      compactionCount: 7,
+    });
+    const cronSession = makeCronSession({
+      sessionEntry: initialEntry,
+      isNewSession: true,
+    });
+    resolveCronSessionMock.mockReturnValue(cronSession);
+
+    // Pass-through fallback wrapper: invoke the real run callback so the
+    // embedded-runner mock below sees the arguments of each attempt.
+    runWithModelFallbackMock.mockImplementation(async ({ provider, model, run }) => {
+      const result = await run(provider, model);
+      return { result, provider, model, attempts: [] };
+    });
+
+    // Attempt 1: the runner requests sonnet with a user-chosen "profile-b".
+    const switchError = new LiveSessionModelSwitchError({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+      authProfileId: "profile-b",
+      authProfileIdSource: "user",
+    });
+    // Attempt 2: the retry succeeds.
+    const retrySuccess = {
+      payloads: [{ text: "task complete" }],
+      meta: {
+        agentMeta: {
+          provider: "anthropic",
+          model: "claude-sonnet-4-6",
+          usage: { input: 100, output: 50 },
+        },
+      },
+    };
+    runEmbeddedPiAgentMock
+      .mockRejectedValueOnce(switchError)
+      .mockResolvedValueOnce(retrySuccess);
+
+    const outcome = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(outcome.status).toBe("ok");
+    expect(runEmbeddedPiAgentMock).toHaveBeenCalledTimes(2);
+    const retryArgs = runEmbeddedPiAgentMock.mock.calls[1]?.[0];
+    expect(retryArgs).toMatchObject({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+      authProfileId: "profile-b",
+      authProfileIdSource: "user",
+    });
+    expect(cronSession.sessionEntry).toMatchObject({
+      authProfileOverride: "profile-b",
+      authProfileOverrideSource: "user",
+    });
+  });
+
+  it("surfaces a non-switch error from the retry after a single model switch", async () => {
+    // NOTE: this does not exercise a runner that throws
+    // LiveSessionModelSwitchError on every attempt. It covers the narrower
+    // case where the first attempt requests a switch and the retry then fails
+    // with an unrelated error, which must reach the outer error handling
+    // rather than being retried again.
+    const switchError = new LiveSessionModelSwitchError({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+    });
+
+    let callCount = 0;
+    runWithModelFallbackMock.mockImplementation(async () => {
+      callCount++;
+      if (callCount <= 1) {
+        throw switchError;
+      }
+      // Retry attempt throws a different error — should propagate up
+      throw new Error("transient network error");
+    });
+
+    const result = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(result.status).toBe("error");
+    expect(String(result.error)).toContain("transient network error");
+    // Exactly two attempts: the switch request, then the failing retry
+    expect(callCount).toBe(2);
+  });
+
+  it("does not retry when the thrown error is not a LiveSessionModelSwitchError", async () => {
+    // Every attempt fails with a plain Error; only one attempt may happen.
+    const unrelatedFailure = new Error("some other error");
+    let attempts = 0;
+    runWithModelFallbackMock.mockImplementation(async () => {
+      attempts += 1;
+      throw unrelatedFailure;
+    });
+
+    const outcome = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(outcome.status).toBe("error");
+    expect(attempts).toBe(1);
+  });
+});
--- a/src/cron/isolated-agent/run.ts
+++ b/src/cron/isolated-agent/run.ts
@ -14,6 +14,7 @@ import { resolveCronStyleNow } from "../../agents/current-time.js";
 import { DEFAULT_CONTEXT_TOKENS } from "../../agents/defaults.js";
 import { resolveFastModeState } from "../../agents/fast-mode.js";
 import { resolveNestedAgentLane } from "../../agents/lanes.js";
+import { LiveSessionModelSwitchError } from "../../agents/live-model-switch.js";
 import { loadModelCatalog } from "../../agents/model-catalog.js";
 import { runWithModelFallback } from "../../agents/model-fallback.js";
 import { isCliProvider, resolveThinkingDefault } from "../../agents/model-selection.js";
@ -426,11 +427,36 @@ export async function runCronIsolatedAgentTurn(params: {
    storePath: cronSession.storePath,
    isNewSession: cronSession.isNewSession,
  });
-  const authProfileIdSource = cronSession.sessionEntry.authProfileOverrideSource;
+  let liveSelection = {
+    provider,
+    model,
+    authProfileId,
+    authProfileIdSource: authProfileId
+      ? cronSession.sessionEntry.authProfileOverrideSource
+      : undefined,
+  };
+  // Copies the current live selection (provider, model, auth profile) onto the
+  // cron session entry. Without an auth profile, all three auth-profile
+  // override fields are removed from the entry.
+  const syncSessionEntryLiveSelection = () => {
+    const entry = cronSession.sessionEntry;
+    const {
+      provider: selectedProvider,
+      model: selectedModel,
+      authProfileId: selectedProfileId,
+      authProfileIdSource: selectedProfileSource,
+    } = liveSelection;
+
+    entry.modelProvider = selectedProvider;
+    entry.model = selectedModel;
+
+    if (!selectedProfileId) {
+      delete entry.authProfileOverride;
+      delete entry.authProfileOverrideSource;
+      delete entry.authProfileOverrideCompactionCount;
+      return;
+    }
+
+    entry.authProfileOverride = selectedProfileId;
+    entry.authProfileOverrideSource = selectedProfileSource;
+    // Only "auto" overrides record the compaction count they were chosen at.
+    if (selectedProfileSource === "auto") {
+      entry.authProfileOverrideCompactionCount = entry.compactionCount ?? 0;
+    } else {
+      delete entry.authProfileOverrideCompactionCount;
+    }
+  };

  let runResult: Awaited<ReturnType<typeof runEmbeddedPiAgent>> | undefined;
-  let fallbackProvider = provider;
-  let fallbackModel = model;
+  let fallbackProvider = liveSelection.provider;
+  let fallbackModel = liveSelection.model;
  const runStartedAt = Date.now();
  let runEndedAt = runStartedAt;
  try {
@ -456,8 +482,8 @@ export async function runCronIsolatedAgentTurn(params: {
    const runPrompt = async (promptText: string) => {
      const fallbackResult = await runWithModelFallback({
        cfg: cfgWithAgentDefaults,
-        provider,
-        model,
+        provider: liveSelection.provider,
+        model: liveSelection.model,
        runId: cronSession.sessionEntry.sessionId,
        agentDir,
        fallbacksOverride:
@ -521,8 +547,10 @@ export async function runCronIsolatedAgentTurn(params: {
            lane: resolveNestedAgentLane(params.lane),
            provider: providerOverride,
            model: modelOverride,
-            authProfileId,
-            authProfileIdSource,
+            authProfileId: liveSelection.authProfileId,
+            authProfileIdSource: liveSelection.authProfileId
+              ? liveSelection.authProfileIdSource
+              : undefined,
            thinkLevel,
            fastMode: resolveFastModeState({
              cfg: cfgWithAgentDefaults,
@ -552,12 +580,46 @@ export async function runCronIsolatedAgentTurn(params: {
      runResult = fallbackResult.result;
      fallbackProvider = fallbackResult.provider;
      fallbackModel = fallbackResult.model;
-      provider = fallbackResult.provider;
-      model = fallbackResult.model;
+      liveSelection.provider = fallbackResult.provider;
+      liveSelection.model = fallbackResult.model;
      runEndedAt = Date.now();
    };

-    await runPrompt(commandBody);
+    // Retry loop: if the isolated session starts with the wrong model (e.g. the
+    // gateway default) and the runner detects a LiveSessionModelSwitchError, we
+    // restart with the selection requested by the error — mirroring the retry
+    // logic in the main agent runner (agent-runner-execution.ts). Without this,
+    // cron jobs that specify a model different from the agent primary always fail.
+    // See: https://github.com/openclaw/openclaw/issues/57206
+    //
+    // The number of switch-driven restarts is capped: a runner that keeps
+    // throwing LiveSessionModelSwitchError would otherwise spin this cron run
+    // forever, re-persisting the session entry on every pass.
+    const maxLiveModelSwitchRetries = 3;
+    let liveModelSwitchRetries = 0;
+    while (true) {
+      try {
+        await runPrompt(commandBody);
+        break;
+      } catch (err) {
+        // Anything other than a model-switch request is a real failure.
+        if (!(err instanceof LiveSessionModelSwitchError)) {
+          throw err;
+        }
+        if (liveModelSwitchRetries >= maxLiveModelSwitchRetries) {
+          throw new Error(
+            `cron isolated run aborted after ${maxLiveModelSwitchRetries} live model-switch retries (last requested ${err.provider}/${err.model})`,
+          );
+        }
+        liveModelSwitchRetries += 1;
+        // Adopt the full selection carried by the error; the source is only
+        // meaningful when an auth profile id accompanies it.
+        liveSelection = {
+          provider: err.provider,
+          model: err.model,
+          authProfileId: err.authProfileId,
+          authProfileIdSource: err.authProfileId ? err.authProfileIdSource : undefined,
+        };
+        fallbackProvider = err.provider;
+        fallbackModel = err.model;
+        syncSessionEntryLiveSelection();
+        // Persist the corrected model before retrying so sessions_list
+        // reflects the real model even if the retry also fails. Persistence
+        // is best-effort: a failure is logged and the retry still proceeds.
+        try {
+          await persistSessionEntry();
+        } catch (persistErr) {
+          logWarn(
+            `[cron:${params.job.id}] Failed to persist model switch session entry: ${String(persistErr)}`,
+          );
+        }
+      }
+    }
    if (!runResult) {
      throw new Error("cron isolated run returned no result");
    }
@ -624,8 +686,9 @@ export async function runCronIsolatedAgentTurn(params: {
    }
    const usage = finalRunResult.meta?.agentMeta?.usage;
    const promptTokens = finalRunResult.meta?.agentMeta?.promptTokens;
-    const modelUsed = finalRunResult.meta?.agentMeta?.model ?? fallbackModel ?? model;
-    const providerUsed = finalRunResult.meta?.agentMeta?.provider ?? fallbackProvider ?? provider;
+    const modelUsed = finalRunResult.meta?.agentMeta?.model ?? fallbackModel ?? liveSelection.model;
+    const providerUsed =
+      finalRunResult.meta?.agentMeta?.provider ?? fallbackProvider ?? liveSelection.provider;
    const contextTokens =
      agentCfg?.contextTokens ??
      lookupContextTokens(modelUsed, { allowAsyncLoad: false }) ??