diff --git a/CHANGELOG.md b/CHANGELOG.md index f47b54a0df7..d578e359b6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai - Diffs: fall back to plain text when `lang` hints are invalid during diff render and viewer hydration, so bad or stale language values no longer break the diff viewer. (#57902) Thanks @gumadeiras. - Doctor/plugins: skip false Matrix legacy-helper warnings when no migration plans exist, and keep bundled `enabledByDefault` plugins in the gateway startup set. (#57931) Thanks @dinakars777. - Matrix/CLI send: start one-off Matrix send clients before outbound delivery so `openclaw message send --channel matrix` restores E2EE in encrypted rooms instead of sending plain events. (#57936) Thanks @gumadeiras. +- Cron/isolated sessions: carry the full live-session provider, model, and auth-profile selection across retry restarts so cron jobs with model overrides no longer fail or loop on mid-run model-switch requests. (#57972) Thanks @issaba1. - Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss. - Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc. - Tools/web_fetch: add an explicit trusted env-proxy path for proxy-only installs while keeping strict SSRF fetches on the pinned direct path, so trusted proxy routing does not weaken strict destination binding. (#50650) Thanks @kkav004. 
diff --git a/src/cron/isolated-agent/run.live-session-model-switch.test.ts b/src/cron/isolated-agent/run.live-session-model-switch.test.ts
new file mode 100644
index 00000000000..db58894872f
--- /dev/null
+++ b/src/cron/isolated-agent/run.live-session-model-switch.test.ts
@@ -0,0 +1,283 @@
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import { LiveSessionModelSwitchError } from "../../agents/live-model-switch.js";
+import {
+  clearFastTestEnv,
+  loadRunCronIsolatedAgentTurn,
+  logWarnMock,
+  makeCronSession,
+  makeCronSessionEntry,
+  resolveAllowedModelRefMock,
+  resolveConfiguredModelRefMock,
+  resolveCronSessionMock,
+  resolveSessionAuthProfileOverrideMock,
+  resetRunCronIsolatedAgentTurnHarness,
+  runEmbeddedPiAgentMock,
+  runWithModelFallbackMock,
+  updateSessionStoreMock,
+} from "./run.test-harness.js";
+
+const runCronIsolatedAgentTurn = await loadRunCronIsolatedAgentTurn();
+
+// ---------- helpers ----------
+
+/**
+ * Builds an isolated cron job whose payload requests `anthropic/claude-sonnet-4-6`.
+ *
+ * @param overrides - Top-level job fields spread over the defaults.
+ */
+function makeJob(overrides?: Record<string, unknown>) {
+  return {
+    id: "cron-model-switch-job",
+    name: "Model Switch Test",
+    schedule: { kind: "cron", expr: "0 * * * *", tz: "UTC" },
+    sessionTarget: "isolated",
+    payload: {
+      kind: "agentTurn",
+      message: "run task",
+      // Cron requests sonnet; agent primary is opus
+      model: "anthropic/claude-sonnet-4-6",
+    },
+    ...overrides,
+  } as never;
+}
+
+/**
+ * Builds the argument object passed to `runCronIsolatedAgentTurn`.
+ *
+ * @param overrides - Top-level params spread over the defaults.
+ */
+function makeParams(overrides?: Record<string, unknown>) {
+  return {
+    cfg: {},
+    deps: {} as never,
+    job: makeJob(),
+    message: "run task",
+    sessionKey: "cron:model-switch",
+    ...overrides,
+  };
+}
+
+/**
+ * Builds the value a successful `runWithModelFallback` mock resolves with.
+ *
+ * @param modelUsed - Model id reported both at the top level and in `agentMeta`.
+ */
+function makeSuccessfulRunResult(modelUsed = "claude-sonnet-4-6") {
+  return {
+    result: {
+      payloads: [{ text: "task complete" }],
+      meta: {
+        agentMeta: {
+          model: modelUsed,
+          provider: "anthropic",
+          usage: { input: 100, output: 50 },
+        },
+      },
+    },
+    provider: "anthropic",
+    model: modelUsed,
+    attempts: [],
+  };
+}
+
+// ---------- tests ----------
+
+describe("runCronIsolatedAgentTurn — LiveSessionModelSwitchError retry (#57206)", () => {
+  let previousFastTestEnv: string | undefined;
+
+  beforeEach(async () => {
+    previousFastTestEnv = clearFastTestEnv();
+    resetRunCronIsolatedAgentTurnHarness();
+
+    resolveConfiguredModelRefMock.mockReturnValue({
+      provider: "anthropic",
+      model: "claude-opus-4-6",
+    });
+    resolveAllowedModelRefMock.mockImplementation(({ raw }: { raw: string }) => {
+      const [provider, model] = raw.split("/");
+      return { ref: { provider, model } };
+    });
+    resolveCronSessionMock.mockReturnValue(
+      makeCronSession({
+        sessionEntry: makeCronSessionEntry({
+          model: undefined,
+          modelProvider: undefined,
+        }),
+        isNewSession: true,
+      }),
+    );
+    updateSessionStoreMock.mockResolvedValue(undefined);
+    logWarnMock.mockReturnValue(undefined);
+  });
+
+  afterEach(() => {
+    if (previousFastTestEnv !== undefined) {
+      process.env.OPENCLAW_TEST_FAST = previousFastTestEnv;
+    } else {
+      delete process.env.OPENCLAW_TEST_FAST;
+    }
+  });
+
+  it("retries with the requested model when runWithModelFallback throws LiveSessionModelSwitchError on the first attempt", async () => {
+    const switchError = new LiveSessionModelSwitchError({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+    });
+
+    let callCount = 0;
+    runWithModelFallbackMock.mockImplementation(
+      async (params: {
+        provider: string;
+        model: string;
+        run: (p: string, m: string) => Promise<unknown>;
+      }) => {
+        callCount++;
+        if (callCount === 1) {
+          // First attempt: session started with opus, throw to request sonnet
+          throw switchError;
+        }
+        // Second attempt: should now be called with sonnet
+        expect(params.provider).toBe("anthropic");
+        expect(params.model).toBe("claude-sonnet-4-6");
+        return makeSuccessfulRunResult("claude-sonnet-4-6");
+      },
+    );
+
+    const result = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(result.status).toBe("ok");
+    expect(callCount).toBe(2);
+  });
+
+  it("persists switched provider/model before retrying", async () => {
+    const cronSession = makeCronSession({
+      sessionEntry: makeCronSessionEntry({
+        model: undefined,
+        modelProvider: undefined,
+      }),
+      isNewSession: true,
+    });
+    resolveCronSessionMock.mockReturnValue(cronSession);
+    const switchError = new LiveSessionModelSwitchError({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+    });
+
+    // Default for any call beyond the two queued rejections: fail fast with a
+    // plain error. The retry loop in run.ts has no attempt limit, so a default
+    // that kept throwing the switch error would hang this test on a regression.
+    runWithModelFallbackMock.mockImplementation(async () => {
+      throw new Error("unexpected extra runWithModelFallback call");
+    });
+    runWithModelFallbackMock
+      .mockRejectedValueOnce(switchError)
+      .mockRejectedValueOnce(new Error("transient network error"));
+
+    const result = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(result.status).toBe("error");
+    expect(String(result.error)).toContain("transient network error");
+    expect(updateSessionStoreMock).toHaveBeenCalled();
+    expect(cronSession.sessionEntry).toMatchObject({
+      model: "claude-sonnet-4-6",
+      modelProvider: "anthropic",
+    });
+  });
+
+  it("retries with switched auth profile state from LiveSessionModelSwitchError", async () => {
+    resolveSessionAuthProfileOverrideMock.mockResolvedValue("profile-a");
+    const cronSession = makeCronSession({
+      sessionEntry: makeCronSessionEntry({
+        model: undefined,
+        modelProvider: undefined,
+        authProfileOverride: "profile-a",
+        authProfileOverrideSource: "auto",
+        compactionCount: 7,
+      }),
+      isNewSession: true,
+    });
+    resolveCronSessionMock.mockReturnValue(cronSession);
+    runWithModelFallbackMock.mockImplementation(async ({ provider, model, run }) => ({
+      result: await run(provider, model),
+      provider,
+      model,
+      attempts: [],
+    }));
+    runEmbeddedPiAgentMock
+      .mockRejectedValueOnce(
+        new LiveSessionModelSwitchError({
+          provider: "anthropic",
+          model: "claude-sonnet-4-6",
+          authProfileId: "profile-b",
+          authProfileIdSource: "user",
+        }),
+      )
+      .mockResolvedValueOnce({
+        payloads: [{ text: "task complete" }],
+        meta: {
+          agentMeta: {
+            provider: "anthropic",
+            model: "claude-sonnet-4-6",
+            usage: { input: 100, output: 50 },
+          },
+        },
+      });
+
+    const result = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(result.status).toBe("ok");
+    expect(runEmbeddedPiAgentMock).toHaveBeenCalledTimes(2);
+    expect(runEmbeddedPiAgentMock.mock.calls[1]?.[0]).toMatchObject({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+      authProfileId: "profile-b",
+      authProfileIdSource: "user",
+    });
+    expect(cronSession.sessionEntry).toMatchObject({
+      authProfileOverride: "profile-b",
+      authProfileOverrideSource: "user",
+    });
+  });
+
+  it("returns the retry's error when the attempt after a model switch fails", async () => {
+    // One switch request followed by an unrelated failure: the loop must retry
+    // exactly once and then let the second error reach the outer catch.
+    // NOTE: a runner that throws LiveSessionModelSwitchError on every attempt is
+    // not exercised here; the retry loop in run.ts is unbounded, so that case
+    // would hang rather than fail.
+    const switchError = new LiveSessionModelSwitchError({
+      provider: "anthropic",
+      model: "claude-sonnet-4-6",
+    });
+
+    let callCount = 0;
+    runWithModelFallbackMock.mockImplementation(async () => {
+      callCount++;
+      if (callCount <= 1) {
+        throw switchError;
+      }
+      // Second attempt throws a different error — should propagate up
+      throw new Error("transient network error");
+    });
+
+    const result = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(result.status).toBe("error");
+    expect(String(result.error)).toContain("transient network error");
+    // Switched once, then failed
+    expect(callCount).toBe(2);
+  });
+
+  it("does not retry when the thrown error is not a LiveSessionModelSwitchError", async () => {
+    let callCount = 0;
+    runWithModelFallbackMock.mockImplementation(async () => {
+      callCount++;
+      throw new Error("some other error");
+    });
+
+    const result = await runCronIsolatedAgentTurn(makeParams());
+
+    expect(result.status).toBe("error");
+    expect(callCount).toBe(1);
+  });
+});
diff --git a/src/cron/isolated-agent/run.ts b/src/cron/isolated-agent/run.ts index 1e551832ebb..1f9d84fddf6 100644 --- a/src/cron/isolated-agent/run.ts +++
b/src/cron/isolated-agent/run.ts @@ -14,6 +14,7 @@ import { resolveCronStyleNow } from "../../agents/current-time.js"; import { DEFAULT_CONTEXT_TOKENS } from "../../agents/defaults.js"; import { resolveFastModeState } from "../../agents/fast-mode.js"; import { resolveNestedAgentLane } from "../../agents/lanes.js"; +import { LiveSessionModelSwitchError } from "../../agents/live-model-switch.js"; import { loadModelCatalog } from "../../agents/model-catalog.js"; import { runWithModelFallback } from "../../agents/model-fallback.js"; import { isCliProvider, resolveThinkingDefault } from "../../agents/model-selection.js"; @@ -426,11 +427,36 @@ export async function runCronIsolatedAgentTurn(params: { storePath: cronSession.storePath, isNewSession: cronSession.isNewSession, }); - const authProfileIdSource = cronSession.sessionEntry.authProfileOverrideSource; + let liveSelection = { + provider, + model, + authProfileId, + authProfileIdSource: authProfileId + ? cronSession.sessionEntry.authProfileOverrideSource + : undefined, + }; + const syncSessionEntryLiveSelection = () => { + cronSession.sessionEntry.modelProvider = liveSelection.provider; + cronSession.sessionEntry.model = liveSelection.model; + if (liveSelection.authProfileId) { + cronSession.sessionEntry.authProfileOverride = liveSelection.authProfileId; + cronSession.sessionEntry.authProfileOverrideSource = liveSelection.authProfileIdSource; + if (liveSelection.authProfileIdSource === "auto") { + cronSession.sessionEntry.authProfileOverrideCompactionCount = + cronSession.sessionEntry.compactionCount ?? 
0; + } else { + delete cronSession.sessionEntry.authProfileOverrideCompactionCount; + } + return; + } + delete cronSession.sessionEntry.authProfileOverride; + delete cronSession.sessionEntry.authProfileOverrideSource; + delete cronSession.sessionEntry.authProfileOverrideCompactionCount; + }; let runResult: Awaited> | undefined; - let fallbackProvider = provider; - let fallbackModel = model; + let fallbackProvider = liveSelection.provider; + let fallbackModel = liveSelection.model; const runStartedAt = Date.now(); let runEndedAt = runStartedAt; try { @@ -456,8 +482,8 @@ export async function runCronIsolatedAgentTurn(params: { const runPrompt = async (promptText: string) => { const fallbackResult = await runWithModelFallback({ cfg: cfgWithAgentDefaults, - provider, - model, + provider: liveSelection.provider, + model: liveSelection.model, runId: cronSession.sessionEntry.sessionId, agentDir, fallbacksOverride: @@ -521,8 +547,10 @@ export async function runCronIsolatedAgentTurn(params: { lane: resolveNestedAgentLane(params.lane), provider: providerOverride, model: modelOverride, - authProfileId, - authProfileIdSource, + authProfileId: liveSelection.authProfileId, + authProfileIdSource: liveSelection.authProfileId + ? liveSelection.authProfileIdSource + : undefined, thinkLevel, fastMode: resolveFastModeState({ cfg: cfgWithAgentDefaults, @@ -552,12 +580,46 @@ export async function runCronIsolatedAgentTurn(params: { runResult = fallbackResult.result; fallbackProvider = fallbackResult.provider; fallbackModel = fallbackResult.model; - provider = fallbackResult.provider; - model = fallbackResult.model; + liveSelection.provider = fallbackResult.provider; + liveSelection.model = fallbackResult.model; runEndedAt = Date.now(); }; - await runPrompt(commandBody); + // Retry loop: if the isolated session starts with the wrong model (e.g. 
the + // gateway default) and the runner detects a LiveSessionModelSwitchError, we + // restart with the model requested by the error — mirroring the retry logic + // in the main agent runner (agent-runner-execution.ts). Without this, cron + // jobs that specify a model different from the agent primary always fail. + // See: https://github.com/openclaw/openclaw/issues/57206 + while (true) { + try { + await runPrompt(commandBody); + break; + } catch (err) { + if (err instanceof LiveSessionModelSwitchError) { + liveSelection = { + provider: err.provider, + model: err.model, + authProfileId: err.authProfileId, + authProfileIdSource: err.authProfileId ? err.authProfileIdSource : undefined, + }; + fallbackProvider = err.provider; + fallbackModel = err.model; + syncSessionEntryLiveSelection(); + // Persist the corrected model before retrying so sessions_list + // reflects the real model even if the retry also fails. + try { + await persistSessionEntry(); + } catch (persistErr) { + logWarn( + `[cron:${params.job.id}] Failed to persist model switch session entry: ${String(persistErr)}`, + ); + } + continue; + } + throw err; + } + } if (!runResult) { throw new Error("cron isolated run returned no result"); } @@ -624,8 +686,9 @@ export async function runCronIsolatedAgentTurn(params: { } const usage = finalRunResult.meta?.agentMeta?.usage; const promptTokens = finalRunResult.meta?.agentMeta?.promptTokens; - const modelUsed = finalRunResult.meta?.agentMeta?.model ?? fallbackModel ?? model; - const providerUsed = finalRunResult.meta?.agentMeta?.provider ?? fallbackProvider ?? provider; + const modelUsed = finalRunResult.meta?.agentMeta?.model ?? fallbackModel ?? liveSelection.model; + const providerUsed = + finalRunResult.meta?.agentMeta?.provider ?? fallbackProvider ?? liveSelection.provider; const contextTokens = agentCfg?.contextTokens ?? lookupContextTokens(modelUsed, { allowAsyncLoad: false }) ??