From 048e25c2b21d56962c482ea63f7fe78795194609 Mon Sep 17 00:00:00 2001 From: Charles Dusek <38732970+cgdusek@users.noreply.github.com> Date: Tue, 10 Mar 2026 07:26:47 -0500 Subject: [PATCH] fix(agents): avoid duplicate same-provider cooldown probes in fallback runs (#41711) Merged via squash. Prepared head SHA: 8be8967bcb4be81f6abc5ff078644ec4efcfe7a0 Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com> Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com> Reviewed-by: @altaywtf --- CHANGELOG.md | 1 + src/agents/model-fallback.test.ts | 80 +++++++++++++++++++++++++++++++ src/agents/model-fallback.ts | 48 +++++++++++++++++++ 3 files changed, 129 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 155bc867062..81c3f0d67c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,7 @@ Docs: https://docs.openclaw.ai - Mattermost/plugin send actions: normalize direct `replyTo` fallback handling so threaded plugin sends trim blank IDs and reuse the correct reply target again. (#41176) Thanks @hnykda. - MS Teams/allowlist resolution: use the General channel conversation ID as the resolved team key (with Graph GUID fallback) so Bot Framework runtime `channelData.team.id` matching works for team and team/channel allowlist entries. (#41838) Thanks @BradGroux. - Mattermost/Markdown formatting: preserve first-line indentation when stripping bot mentions so nested list items and indented code blocks keep their structure, and render Mattermost tables natively by default instead of fenced-code fallback. (#18655) thanks @echo931. +- Agents/fallback cooldown probing: cap cooldown-bypass probing to one attempt per provider per fallback run so multi-model same-provider cooldown chains can continue to cross-provider fallbacks instead of repeatedly stalling on duplicate cooldown probes. (#41711) Thanks @cgdusek. ## 2026.3.8 diff --git a/src/agents/model-fallback.test.ts b/src/agents/model-fallback.test.ts index e4c84028e95..8bc1a6ecb47 100644 --- a/src/agents/model-fallback.test.ts +++ b/src/agents/model-fallback.test.ts @@ -1318,6 +1318,86 @@ describe("runWithModelFallback", () => { }); // Rate limit allows attempt expect(run).toHaveBeenNthCalledWith(2, "groq", "llama-3.3-70b-versatile"); // Cross-provider works }); + + it("limits cooldown probes to one per provider before moving to cross-provider fallback", async () => { + const { dir } = await makeAuthStoreWithCooldown("anthropic", "rate_limit"); + const cfg = makeCfg({ + agents: { + defaults: { + model: { + primary: "anthropic/claude-opus-4-6", + fallbacks: [ + "anthropic/claude-sonnet-4-5", + "anthropic/claude-haiku-3-5", + "groq/llama-3.3-70b-versatile", + ], + }, + }, + }, + }); + + const run = vi + .fn() + .mockRejectedValueOnce(new Error("Still rate limited")) // First same-provider probe fails + .mockResolvedValueOnce("groq success"); // Next provider succeeds + + const result = await runWithModelFallback({ + cfg, + provider: "anthropic", + model: "claude-opus-4-6", + run, + agentDir: dir, + }); + + expect(result.result).toBe("groq success"); + // Primary is skipped, first same-provider fallback is probed, second same-provider fallback + // is skipped (probe already attempted), then cross-provider fallback runs. + expect(run).toHaveBeenCalledTimes(2); + expect(run).toHaveBeenNthCalledWith(1, "anthropic", "claude-sonnet-4-5", { + allowTransientCooldownProbe: true, + }); + expect(run).toHaveBeenNthCalledWith(2, "groq", "llama-3.3-70b-versatile"); + }); + + it("does not consume transient probe slot when first same-provider probe fails with model_not_found", async () => { + const { dir } = await makeAuthStoreWithCooldown("anthropic", "rate_limit"); + const cfg = makeCfg({ + agents: { + defaults: { + model: { + primary: "anthropic/claude-opus-4-6", + fallbacks: [ + "anthropic/claude-sonnet-4-5", + "anthropic/claude-haiku-3-5", + "groq/llama-3.3-70b-versatile", + ], + }, + }, + }, + }); + + const run = vi + .fn() + .mockRejectedValueOnce(new Error("Model not found: anthropic/claude-sonnet-4-5")) + .mockResolvedValueOnce("haiku success"); + + const result = await runWithModelFallback({ + cfg, + provider: "anthropic", + model: "claude-opus-4-6", + run, + agentDir: dir, + }); + + expect(result.result).toBe("haiku success"); + expect(run).toHaveBeenCalledTimes(2); + expect(run).toHaveBeenNthCalledWith(1, "anthropic", "claude-sonnet-4-5", { + allowTransientCooldownProbe: true, + }); + expect(run).toHaveBeenNthCalledWith(2, "anthropic", "claude-haiku-3-5", { + allowTransientCooldownProbe: true, + }); + }); }); }); diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts index 373e10c936f..cda7771d329 100644 --- a/src/agents/model-fallback.ts +++ b/src/agents/model-fallback.ts @@ -521,6 +521,7 @@ export async function runWithModelFallback(params: { : null; const attempts: FallbackAttempt[] = []; let lastError: unknown; + const cooldownProbeUsedProviders = new Set(); const hasFallbackCandidates = candidates.length > 1; @@ -531,6 +532,7 @@ export async function runWithModelFallback(params: { params.provider === candidate.provider && params.model === candidate.model; let runOptions: ModelFallbackRunOptions | undefined; let attemptedDuringCooldown = false; + let transientProbeProviderForAttempt: string | null = null; if (authStore) { const profileIds = resolveAuthProfileOrder({ cfg: params.cfg, @@ -588,7 +590,41 @@ export async function runWithModelFallback(params: { decision.reason === "overloaded" || decision.reason === "billing" ) { + // Probe at most once per provider per fallback run when all profiles + // are cooldowned. Re-probing every same-provider candidate can stall + // cross-provider fallback on providers with long internal retries. + const isTransientCooldownReason = + decision.reason === "rate_limit" || decision.reason === "overloaded"; + if (isTransientCooldownReason && cooldownProbeUsedProviders.has(candidate.provider)) { + const error = `Provider ${candidate.provider} is in cooldown (probe already attempted this run)`; + attempts.push({ + provider: candidate.provider, + model: candidate.model, + error, + reason: decision.reason, + }); + logModelFallbackDecision({ + decision: "skip_candidate", + runId: params.runId, + requestedProvider: params.provider, + requestedModel: params.model, + candidate, + attempt: i + 1, + total: candidates.length, + reason: decision.reason, + error, + nextCandidate: candidates[i + 1], + isPrimary, + requestedModelMatched: requestedModel, + fallbackConfigured: hasFallbackCandidates, + profileCount: profileIds.length, + }); + continue; + } runOptions = { allowTransientCooldownProbe: true }; + if (isTransientCooldownReason) { + transientProbeProviderForAttempt = candidate.provider; + } } attemptedDuringCooldown = true; logModelFallbackDecision({ @@ -643,6 +679,18 @@ export async function runWithModelFallback(params: { } const err = attemptRun.error; { + if (transientProbeProviderForAttempt) { + const probeFailureReason = describeFailoverError(err).reason; + const shouldPreserveTransientProbeSlot = + probeFailureReason === "model_not_found" || + probeFailureReason === "format" || + probeFailureReason === "auth" || + probeFailureReason === "auth_permanent" || + probeFailureReason === "session_expired"; + if (!shouldPreserveTransientProbeSlot) { + cooldownProbeUsedProviders.add(transientProbeProviderForAttempt); + } + } // Context overflow errors should be handled by the inner runner's // compaction/retry logic, not by model fallback. If one escapes as a // throw, rethrow it immediately rather than trying a different model