mirror of https://github.com/openclaw/openclaw.git
fix(agents): normalize abort-wrapped RESOURCE_EXHAUSTED into failover errors (#11972)
This commit is contained in:
parent
2f5d3b6574
commit
dac220bd88
|
|
@ -72,9 +72,16 @@ function getStatusCode(err: unknown): number | undefined {
|
|||
if (!err || typeof err !== "object") {
|
||||
return undefined;
|
||||
}
|
||||
// Dig into nested `err.error` shapes (e.g. Google Vertex abort wrappers)
|
||||
const nestedError =
|
||||
"error" in err && err.error && typeof err.error === "object"
|
||||
? (err.error as { status?: unknown; code?: unknown })
|
||||
: undefined;
|
||||
const candidate =
|
||||
(err as { status?: unknown; statusCode?: unknown }).status ??
|
||||
(err as { statusCode?: unknown }).statusCode;
|
||||
(err as { statusCode?: unknown }).statusCode ??
|
||||
nestedError?.code ??
|
||||
nestedError?.status;
|
||||
if (typeof candidate === "number") {
|
||||
return candidate;
|
||||
}
|
||||
|
|
@ -88,7 +95,11 @@ function getErrorCode(err: unknown): string | undefined {
|
|||
if (!err || typeof err !== "object") {
|
||||
return undefined;
|
||||
}
|
||||
const candidate = (err as { code?: unknown }).code;
|
||||
const nestedError =
|
||||
"error" in err && err.error && typeof err.error === "object"
|
||||
? (err.error as { code?: unknown; status?: unknown })
|
||||
: undefined;
|
||||
const candidate = (err as { code?: unknown }).code ?? nestedError?.status ?? nestedError?.code;
|
||||
if (typeof candidate !== "string") {
|
||||
return undefined;
|
||||
}
|
||||
|
|
@ -114,10 +125,53 @@ function getErrorMessage(err: unknown): string {
|
|||
if (typeof message === "string") {
|
||||
return message;
|
||||
}
|
||||
// Extract message from nested `err.error.message` (e.g. Google Vertex wrappers)
|
||||
const nestedMessage =
|
||||
"error" in err &&
|
||||
err.error &&
|
||||
typeof err.error === "object" &&
|
||||
typeof (err.error as { message?: unknown }).message === "string"
|
||||
? ((err.error as { message: string }).message ?? "")
|
||||
: "";
|
||||
if (nestedMessage) {
|
||||
return nestedMessage;
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
 * Read the `cause` property from an arbitrary thrown value.
 *
 * @param err - Any thrown value; it is not assumed to be an `Error`.
 * @returns The value stored under `cause`, or `undefined` when `err` is falsy,
 *   is not an object, or has no `cause` property (own or inherited).
 */
function getErrorCause(err: unknown): unknown {
  // `in` requires an object operand, so reject primitives and null first.
  if (!err || typeof err !== "object" || !("cause" in err)) {
    return undefined;
  }
  return (err as { cause?: unknown }).cause;
}
|
||||
|
||||
/**
 * Classify rate-limit / overloaded from symbolic error codes like RESOURCE_EXHAUSTED.
 *
 * Matching is exact after trimming surrounding whitespace and upper-casing,
 * so `" resource_exhausted "` and `"RESOURCE_EXHAUSTED"` are treated alike.
 *
 * @param raw - Symbolic error code to classify; may be `undefined`.
 * @returns `"rate_limit"` or `"overloaded"` for a recognized code, otherwise
 *   `null` (including for `undefined`, empty, or whitespace-only input).
 */
function classifyFailoverReasonFromSymbolicCode(raw: string | undefined): FailoverReason | null {
  const normalized = raw?.trim().toUpperCase();
  // Covers undefined input as well as strings that are empty once trimmed.
  if (!normalized) {
    return null;
  }
  switch (normalized) {
    case "RESOURCE_EXHAUSTED":
    case "RATE_LIMIT":
    case "RATE_LIMITED":
    case "RATE_LIMIT_EXCEEDED":
    case "TOO_MANY_REQUESTS":
    case "THROTTLED":
    case "THROTTLING":
    case "THROTTLINGEXCEPTION":
    case "THROTTLING_EXCEPTION":
      return "rate_limit";
    case "OVERLOADED":
    case "OVERLOADED_ERROR":
      return "overloaded";
    default:
      return null;
  }
}
|
||||
|
||||
function hasTimeoutHint(err: unknown): boolean {
|
||||
if (!err) {
|
||||
return false;
|
||||
|
|
@ -160,6 +214,12 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
|
|||
return statusReason;
|
||||
}
|
||||
|
||||
// Check symbolic error codes (e.g. RESOURCE_EXHAUSTED from Google APIs)
|
||||
const symbolicCodeReason = classifyFailoverReasonFromSymbolicCode(getErrorCode(err));
|
||||
if (symbolicCodeReason) {
|
||||
return symbolicCodeReason;
|
||||
}
|
||||
|
||||
const code = (getErrorCode(err) ?? "").toUpperCase();
|
||||
if (
|
||||
[
|
||||
|
|
@ -181,6 +241,14 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
|
|||
if (isTimeoutError(err)) {
|
||||
return "timeout";
|
||||
}
|
||||
// Walk into error cause chain (e.g. AbortError wrapping a rate-limit cause)
|
||||
const cause = getErrorCause(err);
|
||||
if (cause && cause !== err) {
|
||||
const causeReason = resolveFailoverReasonFromError(cause);
|
||||
if (causeReason) {
|
||||
return causeReason;
|
||||
}
|
||||
}
|
||||
if (!message) {
|
||||
return null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -331,6 +331,76 @@ describe("runWithModelFallback – probe logic", () => {
|
|||
});
|
||||
});
|
||||
|
||||
it("keeps walking remaining fallbacks after an abort-wrapped RESOURCE_EXHAUSTED probe failure", async () => {
  // Google is the primary model; two fallbacks live on other providers.
  const cfg = makeCfg({
    agents: {
      defaults: {
        model: {
          primary: "google/gemini-3-flash-preview",
          fallbacks: ["anthropic/claude-haiku-3-5", "deepseek/deepseek-chat"],
        },
      },
    },
  } as Partial<OpenClawConfig>);

  // One auth profile per known provider; any other provider has none.
  mockedResolveAuthProfileOrder.mockImplementation(({ provider }: { provider: string }) => {
    if (provider === "google") {
      return ["google-profile-1"];
    }
    if (provider === "anthropic") {
      return ["anthropic-profile-1"];
    }
    if (provider === "deepseek") {
      return ["deepseek-profile-1"];
    }
    return [];
  });
  // Only the google profile is reported as being in cooldown.
  mockedIsProfileInCooldown.mockImplementation((_store, profileId: string) =>
    profileId.startsWith("google"),
  );
  mockedGetSoonestCooldownExpiry.mockReturnValue(NOW + 30 * 1000);
  mockedResolveProfilesUnavailableReason.mockReturnValue("rate_limit");

  // Simulate Google Vertex abort-wrapped RESOURCE_EXHAUSTED (the shape that was
  // previously swallowed by shouldRethrowAbort before the fallback loop could continue)
  const primaryAbort = Object.assign(new Error("request aborted"), {
    name: "AbortError",
    cause: {
      error: {
        code: 429,
        message: "Resource has been exhausted (e.g. check quota).",
        status: "RESOURCE_EXHAUSTED",
      },
    },
  });
  // Every candidate rejects: the primary with the wrapped abort, both fallbacks with a plain 429.
  const run = vi
    .fn()
    .mockRejectedValueOnce(primaryAbort)
    .mockRejectedValueOnce(
      Object.assign(new Error("fallback still rate limited"), { status: 429 }),
    )
    .mockRejectedValueOnce(
      Object.assign(new Error("final fallback still rate limited"), { status: 429 }),
    );

  await expect(
    runWithModelFallback({
      cfg,
      provider: "google",
      model: "gemini-3-flash-preview",
      run,
    }),
  ).rejects.toThrow(/All models failed \(3\)/);

  // All three candidates must be attempted — the abort must not short-circuit
  expect(run).toHaveBeenCalledTimes(3);
  // Only the primary call is expected to carry the cooldown-probe option.
  expect(run).toHaveBeenNthCalledWith(1, "google", "gemini-3-flash-preview", {
    allowTransientCooldownProbe: true,
  });
  expect(run).toHaveBeenNthCalledWith(2, "anthropic", "claude-haiku-3-5");
  expect(run).toHaveBeenNthCalledWith(3, "deepseek", "deepseek-chat");
});
|
||||
|
||||
it("throttles probe when called within 30s interval", async () => {
|
||||
const cfg = makeCfg();
|
||||
// Cooldown just about to expire (within probe margin)
|
||||
|
|
|
|||
|
|
@ -140,10 +140,16 @@ async function runFallbackCandidate<T>(params: {
|
|||
result,
|
||||
};
|
||||
} catch (err) {
|
||||
if (shouldRethrowAbort(err)) {
|
||||
// Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
|
||||
// so they become FailoverErrors and continue the fallback loop instead of aborting.
|
||||
const normalizedFailover = coerceToFailoverError(err, {
|
||||
provider: params.provider,
|
||||
model: params.model,
|
||||
});
|
||||
if (shouldRethrowAbort(err) && !normalizedFailover) {
|
||||
throw err;
|
||||
}
|
||||
return { ok: false, error: err };
|
||||
return { ok: false, error: normalizedFailover ?? err };
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -28,7 +28,12 @@ import {
|
|||
resolveContextWindowInfo,
|
||||
} from "../context-window-guard.js";
|
||||
import { DEFAULT_CONTEXT_TOKENS, DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
|
||||
import { FailoverError, resolveFailoverStatus } from "../failover-error.js";
|
||||
import {
|
||||
coerceToFailoverError,
|
||||
describeFailoverError,
|
||||
FailoverError,
|
||||
resolveFailoverStatus,
|
||||
} from "../failover-error.js";
|
||||
import {
|
||||
applyLocalNoAuthHeaderOverride,
|
||||
ensureAuthProfileStore,
|
||||
|
|
@ -1217,7 +1222,17 @@ export async function runEmbeddedPiAgent(
|
|||
}
|
||||
|
||||
if (promptError && !aborted) {
|
||||
const errorText = describeUnknownError(promptError);
|
||||
// Normalize wrapped errors (e.g. abort-wrapped RESOURCE_EXHAUSTED) into
|
||||
// FailoverError so rate-limit classification works even for nested shapes.
|
||||
const normalizedPromptFailover = coerceToFailoverError(promptError, {
|
||||
provider: activeErrorContext.provider,
|
||||
model: activeErrorContext.model,
|
||||
profileId: lastProfileId,
|
||||
});
|
||||
const promptErrorDetails = normalizedPromptFailover
|
||||
? describeFailoverError(normalizedPromptFailover)
|
||||
: describeFailoverError(promptError);
|
||||
const errorText = promptErrorDetails.message || describeUnknownError(promptError);
|
||||
if (await maybeRefreshCopilotForAuthError(errorText, copilotAuthRetry)) {
|
||||
authRetryPending = true;
|
||||
continue;
|
||||
|
|
@ -1281,14 +1296,16 @@ export async function runEmbeddedPiAgent(
|
|||
},
|
||||
};
|
||||
}
|
||||
const promptFailoverReason = classifyFailoverReason(errorText);
|
||||
const promptFailoverReason =
|
||||
promptErrorDetails.reason ?? classifyFailoverReason(errorText);
|
||||
const promptProfileFailureReason =
|
||||
resolveAuthProfileFailureReason(promptFailoverReason);
|
||||
await maybeMarkAuthProfileFailure({
|
||||
profileId: lastProfileId,
|
||||
reason: promptProfileFailureReason,
|
||||
});
|
||||
const promptFailoverFailure = isFailoverErrorMessage(errorText);
|
||||
const promptFailoverFailure =
|
||||
promptFailoverReason !== null || isFailoverErrorMessage(errorText);
|
||||
// Capture the failing profile before auth-profile rotation mutates `lastProfileId`.
|
||||
const failedPromptProfileId = lastProfileId;
|
||||
const logPromptFailoverDecision = createFailoverDecisionLogger({
|
||||
|
|
@ -1330,13 +1347,16 @@ export async function runEmbeddedPiAgent(
|
|||
const status = resolveFailoverStatus(promptFailoverReason ?? "unknown");
|
||||
logPromptFailoverDecision("fallback_model", { status });
|
||||
await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
|
||||
throw new FailoverError(errorText, {
|
||||
reason: promptFailoverReason ?? "unknown",
|
||||
provider,
|
||||
model: modelId,
|
||||
profileId: lastProfileId,
|
||||
status,
|
||||
});
|
||||
throw (
|
||||
normalizedPromptFailover ??
|
||||
new FailoverError(errorText, {
|
||||
reason: promptFailoverReason ?? "unknown",
|
||||
provider,
|
||||
model: modelId,
|
||||
profileId: lastProfileId,
|
||||
status: resolveFailoverStatus(promptFailoverReason ?? "unknown"),
|
||||
})
|
||||
);
|
||||
}
|
||||
if (promptFailoverFailure || promptFailoverReason) {
|
||||
logPromptFailoverDecision("surface_error");
|
||||
|
|
|
|||
Loading…
Reference in New Issue