fix(agents): normalize abort-wrapped RESOURCE_EXHAUSTED into failover errors (#11972)

This commit is contained in:
Catalin Lupuleti 2026-03-08 12:21:41 +00:00 committed by Darshil
parent 2f5d3b6574
commit dac220bd88
4 changed files with 179 additions and 15 deletions

View File

@ -72,9 +72,16 @@ function getStatusCode(err: unknown): number | undefined {
if (!err || typeof err !== "object") {
return undefined;
}
// Dig into nested `err.error` shapes (e.g. Google Vertex abort wrappers)
const nestedError =
"error" in err && err.error && typeof err.error === "object"
? (err.error as { status?: unknown; code?: unknown })
: undefined;
const candidate =
(err as { status?: unknown; statusCode?: unknown }).status ??
(err as { statusCode?: unknown }).statusCode;
(err as { statusCode?: unknown }).statusCode ??
nestedError?.code ??
nestedError?.status;
if (typeof candidate === "number") {
return candidate;
}
@ -88,7 +95,11 @@ function getErrorCode(err: unknown): string | undefined {
if (!err || typeof err !== "object") {
return undefined;
}
const candidate = (err as { code?: unknown }).code;
const nestedError =
"error" in err && err.error && typeof err.error === "object"
? (err.error as { code?: unknown; status?: unknown })
: undefined;
const candidate = (err as { code?: unknown }).code ?? nestedError?.status ?? nestedError?.code;
if (typeof candidate !== "string") {
return undefined;
}
@ -114,10 +125,53 @@ function getErrorMessage(err: unknown): string {
if (typeof message === "string") {
return message;
}
// Extract message from nested `err.error.message` (e.g. Google Vertex wrappers)
const nestedMessage =
"error" in err &&
err.error &&
typeof err.error === "object" &&
typeof (err.error as { message?: unknown }).message === "string"
? ((err.error as { message: string }).message ?? "")
: "";
if (nestedMessage) {
return nestedMessage;
}
}
return "";
}
/**
 * Read the `cause` property from an error-like value.
 *
 * @param err - Any thrown value.
 * @returns The value stored under `cause`, or `undefined` when `err` is not a
 *   non-null object or has no `cause` property.
 */
function getErrorCause(err: unknown): unknown {
  // Only non-null objects can carry a `cause`; the `in` check also sees inherited properties.
  if (typeof err === "object" && err !== null && "cause" in err) {
    return (err as { cause?: unknown }).cause;
  }
  return undefined;
}
/** Symbolic codes (upper-cased) that are classified as `"rate_limit"`. */
const SYMBOLIC_RATE_LIMIT_CODES: ReadonlySet<string> = new Set([
  "RESOURCE_EXHAUSTED",
  "RATE_LIMIT",
  "RATE_LIMITED",
  "RATE_LIMIT_EXCEEDED",
  "TOO_MANY_REQUESTS",
  "THROTTLED",
  "THROTTLING",
  "THROTTLINGEXCEPTION",
  "THROTTLING_EXCEPTION",
]);

/** Symbolic codes (upper-cased) that are classified as `"overloaded"`. */
const SYMBOLIC_OVERLOADED_CODES: ReadonlySet<string> = new Set(["OVERLOADED", "OVERLOADED_ERROR"]);

/**
 * Classify a symbolic error code such as `RESOURCE_EXHAUSTED` as a failover reason.
 *
 * The code is trimmed and compared case-insensitively against the known
 * rate-limit and overloaded codes.
 *
 * @param raw - Symbolic error code, if the error carried one.
 * @returns `"rate_limit"` or `"overloaded"` for a recognized code; `null` when
 *   the code is missing, blank, or not recognized.
 */
function classifyFailoverReasonFromSymbolicCode(raw: string | undefined): FailoverReason | null {
  const code = (raw ?? "").trim().toUpperCase();
  if (code.length === 0) {
    return null;
  }
  if (SYMBOLIC_RATE_LIMIT_CODES.has(code)) {
    return "rate_limit";
  }
  if (SYMBOLIC_OVERLOADED_CODES.has(code)) {
    return "overloaded";
  }
  return null;
}
function hasTimeoutHint(err: unknown): boolean {
if (!err) {
return false;
@ -160,6 +214,12 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
return statusReason;
}
// Check symbolic error codes (e.g. RESOURCE_EXHAUSTED from Google APIs)
const symbolicCodeReason = classifyFailoverReasonFromSymbolicCode(getErrorCode(err));
if (symbolicCodeReason) {
return symbolicCodeReason;
}
const code = (getErrorCode(err) ?? "").toUpperCase();
if (
[
@ -181,6 +241,14 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
if (isTimeoutError(err)) {
return "timeout";
}
// Walk into error cause chain (e.g. AbortError wrapping a rate-limit cause)
const cause = getErrorCause(err);
if (cause && cause !== err) {
const causeReason = resolveFailoverReasonFromError(cause);
if (causeReason) {
return causeReason;
}
}
if (!message) {
return null;
}

View File

@ -331,6 +331,76 @@ describe("runWithModelFallback probe logic", () => {
});
});
it("keeps walking remaining fallbacks after an abort-wrapped RESOURCE_EXHAUSTED probe failure", async () => {
  // Primary Google model followed by two fallbacks on other providers.
  const cfg = makeCfg({
    agents: {
      defaults: {
        model: {
          primary: "google/gemini-3-flash-preview",
          fallbacks: ["anthropic/claude-haiku-3-5", "deepseek/deepseek-chat"],
        },
      },
    },
  } as Partial<OpenClawConfig>);

  // One auth profile per known provider; any other provider resolves to none.
  const profileIdByProvider = new Map<string, string>([
    ["google", "google-profile-1"],
    ["anthropic", "anthropic-profile-1"],
    ["deepseek", "deepseek-profile-1"],
  ]);
  mockedResolveAuthProfileOrder.mockImplementation(({ provider }: { provider: string }) => {
    const profileId = profileIdByProvider.get(provider);
    return profileId === undefined ? [] : [profileId];
  });

  // Only the Google profile is reported as cooling down, with a rate-limit reason.
  mockedIsProfileInCooldown.mockImplementation((_store, profileId: string) => {
    return profileId.startsWith("google");
  });
  mockedGetSoonestCooldownExpiry.mockReturnValue(NOW + 30 * 1000);
  mockedResolveProfilesUnavailableReason.mockReturnValue("rate_limit");

  // Google Vertex style failure: an AbortError whose `cause` carries the nested
  // RESOURCE_EXHAUSTED payload. This shape used to be rethrown via
  // shouldRethrowAbort before the fallback loop could move on.
  const vertexQuotaPayload = {
    error: {
      code: 429,
      message: "Resource has been exhausted (e.g. check quota).",
      status: "RESOURCE_EXHAUSTED",
    },
  };
  const abortWrappedQuotaError = Object.assign(new Error("request aborted"), {
    name: "AbortError",
    cause: vertexQuotaPayload,
  });
  const rateLimitedError = (message: string) =>
    Object.assign(new Error(message), { status: 429 });

  const run = vi
    .fn()
    .mockRejectedValueOnce(abortWrappedQuotaError)
    .mockRejectedValueOnce(rateLimitedError("fallback still rate limited"))
    .mockRejectedValueOnce(rateLimitedError("final fallback still rate limited"));

  const outcome = runWithModelFallback({
    cfg,
    provider: "google",
    model: "gemini-3-flash-preview",
    run,
  });
  await expect(outcome).rejects.toThrow(/All models failed \(3\)/);

  // Every candidate has to be tried: the abort on the primary must not stop the loop.
  expect(run).toHaveBeenCalledTimes(3);
  expect(run).toHaveBeenNthCalledWith(1, "google", "gemini-3-flash-preview", {
    allowTransientCooldownProbe: true,
  });
  expect(run).toHaveBeenNthCalledWith(2, "anthropic", "claude-haiku-3-5");
  expect(run).toHaveBeenNthCalledWith(3, "deepseek", "deepseek-chat");
});
it("throttles probe when called within 30s interval", async () => {
const cfg = makeCfg();
// Cooldown just about to expire (within probe margin)

View File

@ -140,10 +140,16 @@ async function runFallbackCandidate<T>(params: {
result,
};
} catch (err) {
if (shouldRethrowAbort(err)) {
// Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
// so they become FailoverErrors and continue the fallback loop instead of aborting.
const normalizedFailover = coerceToFailoverError(err, {
provider: params.provider,
model: params.model,
});
if (shouldRethrowAbort(err) && !normalizedFailover) {
throw err;
}
return { ok: false, error: err };
return { ok: false, error: normalizedFailover ?? err };
}
}

View File

@ -28,7 +28,12 @@ import {
resolveContextWindowInfo,
} from "../context-window-guard.js";
import { DEFAULT_CONTEXT_TOKENS, DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
import { FailoverError, resolveFailoverStatus } from "../failover-error.js";
import {
coerceToFailoverError,
describeFailoverError,
FailoverError,
resolveFailoverStatus,
} from "../failover-error.js";
import {
applyLocalNoAuthHeaderOverride,
ensureAuthProfileStore,
@ -1217,7 +1222,17 @@ export async function runEmbeddedPiAgent(
}
if (promptError && !aborted) {
const errorText = describeUnknownError(promptError);
// Normalize wrapped errors (e.g. abort-wrapped RESOURCE_EXHAUSTED) into
// FailoverError so rate-limit classification works even for nested shapes.
const normalizedPromptFailover = coerceToFailoverError(promptError, {
provider: activeErrorContext.provider,
model: activeErrorContext.model,
profileId: lastProfileId,
});
const promptErrorDetails = normalizedPromptFailover
? describeFailoverError(normalizedPromptFailover)
: describeFailoverError(promptError);
const errorText = promptErrorDetails.message || describeUnknownError(promptError);
if (await maybeRefreshCopilotForAuthError(errorText, copilotAuthRetry)) {
authRetryPending = true;
continue;
@ -1281,14 +1296,16 @@ export async function runEmbeddedPiAgent(
},
};
}
const promptFailoverReason = classifyFailoverReason(errorText);
const promptFailoverReason =
promptErrorDetails.reason ?? classifyFailoverReason(errorText);
const promptProfileFailureReason =
resolveAuthProfileFailureReason(promptFailoverReason);
await maybeMarkAuthProfileFailure({
profileId: lastProfileId,
reason: promptProfileFailureReason,
});
const promptFailoverFailure = isFailoverErrorMessage(errorText);
const promptFailoverFailure =
promptFailoverReason !== null || isFailoverErrorMessage(errorText);
// Capture the failing profile before auth-profile rotation mutates `lastProfileId`.
const failedPromptProfileId = lastProfileId;
const logPromptFailoverDecision = createFailoverDecisionLogger({
@ -1330,13 +1347,16 @@ export async function runEmbeddedPiAgent(
const status = resolveFailoverStatus(promptFailoverReason ?? "unknown");
logPromptFailoverDecision("fallback_model", { status });
await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
throw new FailoverError(errorText, {
reason: promptFailoverReason ?? "unknown",
provider,
model: modelId,
profileId: lastProfileId,
status,
});
throw (
normalizedPromptFailover ??
new FailoverError(errorText, {
reason: promptFailoverReason ?? "unknown",
provider,
model: modelId,
profileId: lastProfileId,
status: resolveFailoverStatus(promptFailoverReason ?? "unknown"),
})
);
}
if (promptFailoverFailure || promptFailoverReason) {
logPromptFailoverDecision("surface_error");