mirror of https://github.com/openclaw/openclaw.git
fix(agents): normalize abort-wrapped RESOURCE_EXHAUSTED into failover errors (#11972)
This commit is contained in:
parent
2f5d3b6574
commit
dac220bd88
|
|
@ -72,9 +72,16 @@ function getStatusCode(err: unknown): number | undefined {
|
||||||
if (!err || typeof err !== "object") {
|
if (!err || typeof err !== "object") {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
// Dig into nested `err.error` shapes (e.g. Google Vertex abort wrappers)
|
||||||
|
const nestedError =
|
||||||
|
"error" in err && err.error && typeof err.error === "object"
|
||||||
|
? (err.error as { status?: unknown; code?: unknown })
|
||||||
|
: undefined;
|
||||||
const candidate =
|
const candidate =
|
||||||
(err as { status?: unknown; statusCode?: unknown }).status ??
|
(err as { status?: unknown; statusCode?: unknown }).status ??
|
||||||
(err as { statusCode?: unknown }).statusCode;
|
(err as { statusCode?: unknown }).statusCode ??
|
||||||
|
nestedError?.code ??
|
||||||
|
nestedError?.status;
|
||||||
if (typeof candidate === "number") {
|
if (typeof candidate === "number") {
|
||||||
return candidate;
|
return candidate;
|
||||||
}
|
}
|
||||||
|
|
@ -88,7 +95,11 @@ function getErrorCode(err: unknown): string | undefined {
|
||||||
if (!err || typeof err !== "object") {
|
if (!err || typeof err !== "object") {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
const candidate = (err as { code?: unknown }).code;
|
const nestedError =
|
||||||
|
"error" in err && err.error && typeof err.error === "object"
|
||||||
|
? (err.error as { code?: unknown; status?: unknown })
|
||||||
|
: undefined;
|
||||||
|
const candidate = (err as { code?: unknown }).code ?? nestedError?.status ?? nestedError?.code;
|
||||||
if (typeof candidate !== "string") {
|
if (typeof candidate !== "string") {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
@ -114,10 +125,53 @@ function getErrorMessage(err: unknown): string {
|
||||||
if (typeof message === "string") {
|
if (typeof message === "string") {
|
||||||
return message;
|
return message;
|
||||||
}
|
}
|
||||||
|
// Extract message from nested `err.error.message` (e.g. Google Vertex wrappers)
|
||||||
|
const nestedMessage =
|
||||||
|
"error" in err &&
|
||||||
|
err.error &&
|
||||||
|
typeof err.error === "object" &&
|
||||||
|
typeof (err.error as { message?: unknown }).message === "string"
|
||||||
|
? ((err.error as { message: string }).message ?? "")
|
||||||
|
: "";
|
||||||
|
if (nestedMessage) {
|
||||||
|
return nestedMessage;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Read the `cause` attached to an error-like value.
 *
 * @param err - Any thrown value.
 * @returns The value of `err.cause` when `err` is a non-null object that has a
 *   `cause` property (own or inherited); otherwise `undefined`. The cause is
 *   returned as-is and is not validated or unwrapped further.
 */
function getErrorCause(err: unknown): unknown {
  // Primitives, null and undefined cannot carry a cause.
  if (!err || typeof err !== "object") {
    return undefined;
  }
  // The `in` check narrows `err`, so no type assertion is needed to read it.
  return "cause" in err ? err.cause : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Classify rate-limit / overloaded from symbolic error codes like RESOURCE_EXHAUSTED.
 *
 * The input is trimmed and upper-cased before matching, so matching is
 * case-insensitive and ignores surrounding whitespace. Matching is exact on the
 * normalized value; no substring or separator normalization is performed.
 *
 * @param raw - Symbolic error code, or `undefined` when none is available.
 * @returns `"rate_limit"` or `"overloaded"` for the recognized codes listed in
 *   the switch below, and `null` for missing, blank, or unrecognized input.
 */
function classifyFailoverReasonFromSymbolicCode(raw: string | undefined): FailoverReason | null {
  const normalized = raw?.trim().toUpperCase();
  // Covers undefined input as well as empty / whitespace-only strings.
  if (!normalized) {
    return null;
  }
  switch (normalized) {
    case "RESOURCE_EXHAUSTED":
    case "RATE_LIMIT":
    case "RATE_LIMITED":
    case "RATE_LIMIT_EXCEEDED":
    case "TOO_MANY_REQUESTS":
    case "THROTTLED":
    case "THROTTLING":
    case "THROTTLINGEXCEPTION":
    case "THROTTLING_EXCEPTION":
      return "rate_limit";
    case "OVERLOADED":
    case "OVERLOADED_ERROR":
      return "overloaded";
    default:
      return null;
  }
}
|
||||||
|
|
||||||
function hasTimeoutHint(err: unknown): boolean {
|
function hasTimeoutHint(err: unknown): boolean {
|
||||||
if (!err) {
|
if (!err) {
|
||||||
return false;
|
return false;
|
||||||
|
|
@ -160,6 +214,12 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
|
||||||
return statusReason;
|
return statusReason;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check symbolic error codes (e.g. RESOURCE_EXHAUSTED from Google APIs)
|
||||||
|
const symbolicCodeReason = classifyFailoverReasonFromSymbolicCode(getErrorCode(err));
|
||||||
|
if (symbolicCodeReason) {
|
||||||
|
return symbolicCodeReason;
|
||||||
|
}
|
||||||
|
|
||||||
const code = (getErrorCode(err) ?? "").toUpperCase();
|
const code = (getErrorCode(err) ?? "").toUpperCase();
|
||||||
if (
|
if (
|
||||||
[
|
[
|
||||||
|
|
@ -181,6 +241,14 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
|
||||||
if (isTimeoutError(err)) {
|
if (isTimeoutError(err)) {
|
||||||
return "timeout";
|
return "timeout";
|
||||||
}
|
}
|
||||||
|
// Walk into error cause chain (e.g. AbortError wrapping a rate-limit cause)
|
||||||
|
const cause = getErrorCause(err);
|
||||||
|
if (cause && cause !== err) {
|
||||||
|
const causeReason = resolveFailoverReasonFromError(cause);
|
||||||
|
if (causeReason) {
|
||||||
|
return causeReason;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!message) {
|
if (!message) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -331,6 +331,76 @@ describe("runWithModelFallback – probe logic", () => {
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("keeps walking remaining fallbacks after an abort-wrapped RESOURCE_EXHAUSTED probe failure", async () => {
  // Primary on Google with two fallbacks on different providers.
  const cfg = makeCfg({
    agents: {
      defaults: {
        model: {
          primary: "google/gemini-3-flash-preview",
          fallbacks: ["anthropic/claude-haiku-3-5", "deepseek/deepseek-chat"],
        },
      },
    },
  } as Partial<OpenClawConfig>);

  // One auth profile per provider; unknown providers have none.
  mockedResolveAuthProfileOrder.mockImplementation(({ provider }: { provider: string }) => {
    if (provider === "google") {
      return ["google-profile-1"];
    }
    if (provider === "anthropic") {
      return ["anthropic-profile-1"];
    }
    if (provider === "deepseek") {
      return ["deepseek-profile-1"];
    }
    return [];
  });
  // Only the Google profile is reported as being in cooldown.
  mockedIsProfileInCooldown.mockImplementation((_store, profileId: string) =>
    profileId.startsWith("google"),
  );
  mockedGetSoonestCooldownExpiry.mockReturnValue(NOW + 30 * 1000);
  mockedResolveProfilesUnavailableReason.mockReturnValue("rate_limit");

  // Simulate Google Vertex abort-wrapped RESOURCE_EXHAUSTED (the shape that was
  // previously swallowed by shouldRethrowAbort before the fallback loop could continue)
  const primaryAbort = Object.assign(new Error("request aborted"), {
    name: "AbortError",
    cause: {
      error: {
        code: 429,
        message: "Resource has been exhausted (e.g. check quota).",
        status: "RESOURCE_EXHAUSTED",
      },
    },
  });
  // Every candidate rejects: the primary with the abort wrapper, both fallbacks with a plain 429.
  const run = vi
    .fn()
    .mockRejectedValueOnce(primaryAbort)
    .mockRejectedValueOnce(
      Object.assign(new Error("fallback still rate limited"), { status: 429 }),
    )
    .mockRejectedValueOnce(
      Object.assign(new Error("final fallback still rate limited"), { status: 429 }),
    );

  await expect(
    runWithModelFallback({
      cfg,
      provider: "google",
      model: "gemini-3-flash-preview",
      run,
    }),
  ).rejects.toThrow(/All models failed \(3\)/);

  // All three candidates must be attempted — the abort must not short-circuit
  expect(run).toHaveBeenCalledTimes(3);
  expect(run).toHaveBeenNthCalledWith(1, "google", "gemini-3-flash-preview", {
    allowTransientCooldownProbe: true,
  });
  expect(run).toHaveBeenNthCalledWith(2, "anthropic", "claude-haiku-3-5");
  expect(run).toHaveBeenNthCalledWith(3, "deepseek", "deepseek-chat");
});
|
||||||
|
|
||||||
it("throttles probe when called within 30s interval", async () => {
|
it("throttles probe when called within 30s interval", async () => {
|
||||||
const cfg = makeCfg();
|
const cfg = makeCfg();
|
||||||
// Cooldown just about to expire (within probe margin)
|
// Cooldown just about to expire (within probe margin)
|
||||||
|
|
|
||||||
|
|
@ -140,10 +140,16 @@ async function runFallbackCandidate<T>(params: {
|
||||||
result,
|
result,
|
||||||
};
|
};
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (shouldRethrowAbort(err)) {
|
// Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
|
||||||
|
// so they become FailoverErrors and continue the fallback loop instead of aborting.
|
||||||
|
const normalizedFailover = coerceToFailoverError(err, {
|
||||||
|
provider: params.provider,
|
||||||
|
model: params.model,
|
||||||
|
});
|
||||||
|
if (shouldRethrowAbort(err) && !normalizedFailover) {
|
||||||
throw err;
|
throw err;
|
||||||
}
|
}
|
||||||
return { ok: false, error: err };
|
return { ok: false, error: normalizedFailover ?? err };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,12 @@ import {
|
||||||
resolveContextWindowInfo,
|
resolveContextWindowInfo,
|
||||||
} from "../context-window-guard.js";
|
} from "../context-window-guard.js";
|
||||||
import { DEFAULT_CONTEXT_TOKENS, DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
|
import { DEFAULT_CONTEXT_TOKENS, DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
|
||||||
import { FailoverError, resolveFailoverStatus } from "../failover-error.js";
|
import {
|
||||||
|
coerceToFailoverError,
|
||||||
|
describeFailoverError,
|
||||||
|
FailoverError,
|
||||||
|
resolveFailoverStatus,
|
||||||
|
} from "../failover-error.js";
|
||||||
import {
|
import {
|
||||||
applyLocalNoAuthHeaderOverride,
|
applyLocalNoAuthHeaderOverride,
|
||||||
ensureAuthProfileStore,
|
ensureAuthProfileStore,
|
||||||
|
|
@ -1217,7 +1222,17 @@ export async function runEmbeddedPiAgent(
|
||||||
}
|
}
|
||||||
|
|
||||||
if (promptError && !aborted) {
|
if (promptError && !aborted) {
|
||||||
const errorText = describeUnknownError(promptError);
|
// Normalize wrapped errors (e.g. abort-wrapped RESOURCE_EXHAUSTED) into
|
||||||
|
// FailoverError so rate-limit classification works even for nested shapes.
|
||||||
|
const normalizedPromptFailover = coerceToFailoverError(promptError, {
|
||||||
|
provider: activeErrorContext.provider,
|
||||||
|
model: activeErrorContext.model,
|
||||||
|
profileId: lastProfileId,
|
||||||
|
});
|
||||||
|
const promptErrorDetails = normalizedPromptFailover
|
||||||
|
? describeFailoverError(normalizedPromptFailover)
|
||||||
|
: describeFailoverError(promptError);
|
||||||
|
const errorText = promptErrorDetails.message || describeUnknownError(promptError);
|
||||||
if (await maybeRefreshCopilotForAuthError(errorText, copilotAuthRetry)) {
|
if (await maybeRefreshCopilotForAuthError(errorText, copilotAuthRetry)) {
|
||||||
authRetryPending = true;
|
authRetryPending = true;
|
||||||
continue;
|
continue;
|
||||||
|
|
@ -1281,14 +1296,16 @@ export async function runEmbeddedPiAgent(
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
const promptFailoverReason = classifyFailoverReason(errorText);
|
const promptFailoverReason =
|
||||||
|
promptErrorDetails.reason ?? classifyFailoverReason(errorText);
|
||||||
const promptProfileFailureReason =
|
const promptProfileFailureReason =
|
||||||
resolveAuthProfileFailureReason(promptFailoverReason);
|
resolveAuthProfileFailureReason(promptFailoverReason);
|
||||||
await maybeMarkAuthProfileFailure({
|
await maybeMarkAuthProfileFailure({
|
||||||
profileId: lastProfileId,
|
profileId: lastProfileId,
|
||||||
reason: promptProfileFailureReason,
|
reason: promptProfileFailureReason,
|
||||||
});
|
});
|
||||||
const promptFailoverFailure = isFailoverErrorMessage(errorText);
|
const promptFailoverFailure =
|
||||||
|
promptFailoverReason !== null || isFailoverErrorMessage(errorText);
|
||||||
// Capture the failing profile before auth-profile rotation mutates `lastProfileId`.
|
// Capture the failing profile before auth-profile rotation mutates `lastProfileId`.
|
||||||
const failedPromptProfileId = lastProfileId;
|
const failedPromptProfileId = lastProfileId;
|
||||||
const logPromptFailoverDecision = createFailoverDecisionLogger({
|
const logPromptFailoverDecision = createFailoverDecisionLogger({
|
||||||
|
|
@ -1330,13 +1347,16 @@ export async function runEmbeddedPiAgent(
|
||||||
const status = resolveFailoverStatus(promptFailoverReason ?? "unknown");
|
const status = resolveFailoverStatus(promptFailoverReason ?? "unknown");
|
||||||
logPromptFailoverDecision("fallback_model", { status });
|
logPromptFailoverDecision("fallback_model", { status });
|
||||||
await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
|
await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
|
||||||
throw new FailoverError(errorText, {
|
throw (
|
||||||
reason: promptFailoverReason ?? "unknown",
|
normalizedPromptFailover ??
|
||||||
provider,
|
new FailoverError(errorText, {
|
||||||
model: modelId,
|
reason: promptFailoverReason ?? "unknown",
|
||||||
profileId: lastProfileId,
|
provider,
|
||||||
status,
|
model: modelId,
|
||||||
});
|
profileId: lastProfileId,
|
||||||
|
status: resolveFailoverStatus(promptFailoverReason ?? "unknown"),
|
||||||
|
})
|
||||||
|
);
|
||||||
}
|
}
|
||||||
if (promptFailoverFailure || promptFailoverReason) {
|
if (promptFailoverFailure || promptFailoverReason) {
|
||||||
logPromptFailoverDecision("surface_error");
|
logPromptFailoverDecision("surface_error");
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue