fix(agents): normalize abort-wrapped RESOURCE_EXHAUSTED into failover errors (#11972)

2026-03-08 12:21:41 +00:00 · 2026-03-08 12:21:41 +00:00 · dac220bd88
parent 2f5d3b6574
commit dac220bd88
4 changed files with 179 additions and 15 deletions
--- a/src/agents/failover-error.ts
+++ b/src/agents/failover-error.ts
@ -72,9 +72,16 @@ function getStatusCode(err: unknown): number | undefined {
  if (!err || typeof err !== "object") {
    return undefined;
  }
  // Dig into nested `err.error` shapes (e.g. Google Vertex abort wrappers)
  const nestedError =
    "error" in err && err.error && typeof err.error === "object"
      ? (err.error as { status?: unknown; code?: unknown })
      : undefined;
  const candidate =
    (err as { status?: unknown; statusCode?: unknown }).status ??
-    (err as { statusCode?: unknown }).statusCode;
+    (err as { statusCode?: unknown }).statusCode ??
    nestedError?.code ??
    nestedError?.status;
  if (typeof candidate === "number") {
    return candidate;
  }
@ -88,7 +95,11 @@ function getErrorCode(err: unknown): string | undefined {
  if (!err || typeof err !== "object") {
    return undefined;
  }
-  const candidate = (err as { code?: unknown }).code;
+  const nestedError =
    "error" in err && err.error && typeof err.error === "object"
      ? (err.error as { code?: unknown; status?: unknown })
      : undefined;
  const candidate = (err as { code?: unknown }).code ?? nestedError?.status ?? nestedError?.code;
  if (typeof candidate !== "string") {
    return undefined;
  }
@ -114,10 +125,53 @@ function getErrorMessage(err: unknown): string {
    if (typeof message === "string") {
      return message;
    }
    // Extract message from nested `err.error.message` (e.g. Google Vertex wrappers)
    const nestedMessage =
      "error" in err &&
      err.error &&
      typeof err.error === "object" &&
      typeof (err.error as { message?: unknown }).message === "string"
        ? ((err.error as { message: string }).message ?? "")
        : "";
    if (nestedMessage) {
      return nestedMessage;
    }
  }
  return "";
 }
 /**
  * Read the `cause` attached to an error-like value.
  *
  * @param err - Any thrown value; only non-null objects are inspected.
  * @returns The value stored under `cause` (own or inherited), or `undefined`
  *   when `err` is not an object or carries no `cause` property.
  */
 function getErrorCause(err: unknown): unknown {
  // Primitives, null/undefined and functions never expose a cause here.
  const carriesCause = typeof err === "object" && err !== null && "cause" in err;
  return carriesCause ? (err as { cause?: unknown }).cause : undefined;
 }
 /** Upper-cased symbolic codes that are treated as a rate-limit signal. */
 const RATE_LIMIT_SYMBOLIC_CODES: ReadonlySet<string> = new Set([
  "RESOURCE_EXHAUSTED",
  "RATE_LIMIT",
  "RATE_LIMITED",
  "RATE_LIMIT_EXCEEDED",
  "TOO_MANY_REQUESTS",
  "THROTTLED",
  "THROTTLING",
  "THROTTLINGEXCEPTION",
  "THROTTLING_EXCEPTION",
 ]);
 /** Upper-cased symbolic codes that are treated as an overloaded signal. */
 const OVERLOADED_SYMBOLIC_CODES: ReadonlySet<string> = new Set(["OVERLOADED", "OVERLOADED_ERROR"]);
 /**
  * Map a symbolic error code (for example `RESOURCE_EXHAUSTED`) to a failover reason.
  *
  * The code is trimmed and compared case-insensitively.
  *
  * @param raw - Symbolic code taken from an error, if any.
  * @returns `"rate_limit"` or `"overloaded"` for a recognized code; `null` when the
  *   code is missing, blank, or not recognized.
  */
 function classifyFailoverReasonFromSymbolicCode(raw: string | undefined): FailoverReason | null {
  const symbolicCode = (raw ?? "").trim().toUpperCase();
  if (symbolicCode === "") {
    return null;
  }
  if (RATE_LIMIT_SYMBOLIC_CODES.has(symbolicCode)) {
    return "rate_limit";
  }
  if (OVERLOADED_SYMBOLIC_CODES.has(symbolicCode)) {
    return "overloaded";
  }
  return null;
 }
 function hasTimeoutHint(err: unknown): boolean {
  if (!err) {
    return false;
@ -160,6 +214,12 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
    return statusReason;
  }
  // Check symbolic error codes (e.g. RESOURCE_EXHAUSTED from Google APIs)
  const symbolicCodeReason = classifyFailoverReasonFromSymbolicCode(getErrorCode(err));
  if (symbolicCodeReason) {
    return symbolicCodeReason;
  }
  const code = (getErrorCode(err) ?? "").toUpperCase();
  if (
    [
@ -181,6 +241,14 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
  if (isTimeoutError(err)) {
    return "timeout";
  }
  // Walk into error cause chain (e.g. AbortError wrapping a rate-limit cause)
  const cause = getErrorCause(err);
  if (cause && cause !== err) {
    const causeReason = resolveFailoverReasonFromError(cause);
    if (causeReason) {
      return causeReason;
    }
  }
  if (!message) {
    return null;
  }
--- a/src/agents/model-fallback.probe.test.ts
+++ b/src/agents/model-fallback.probe.test.ts
@ -331,6 +331,76 @@ describe("runWithModelFallback – probe logic", () => {
    });
  });
  it("keeps walking remaining fallbacks after an abort-wrapped RESOURCE_EXHAUSTED probe failure", async () => {
    // One primary model plus two fallbacks, each on a different provider.
    const modelDefaults = {
      primary: "google/gemini-3-flash-preview",
      fallbacks: ["anthropic/claude-haiku-3-5", "deepseek/deepseek-chat"],
    };
    const cfg = makeCfg({
      agents: { defaults: { model: modelDefaults } },
    } as Partial<OpenClawConfig>);

    // Each known provider has exactly one auth profile; anything else has none.
    const providersWithProfile = new Set(["google", "anthropic", "deepseek"]);
    mockedResolveAuthProfileOrder.mockImplementation(({ provider }: { provider: string }) =>
      providersWithProfile.has(provider) ? [`${provider}-profile-1`] : [],
    );
    // Only the Google profile is reported as cooling down.
    mockedIsProfileInCooldown.mockImplementation((_store, profileId: string) => {
      return profileId.startsWith("google");
    });
    const soonestExpiry = NOW + 30 * 1000;
    mockedGetSoonestCooldownExpiry.mockReturnValue(soonestExpiry);
    mockedResolveProfilesUnavailableReason.mockReturnValue("rate_limit");

    // Simulate Google Vertex abort-wrapped RESOURCE_EXHAUSTED (the shape that was
    // previously swallowed by shouldRethrowAbort before the fallback loop could continue)
    const vertexQuotaPayload = {
      error: {
        code: 429,
        message: "Resource has been exhausted (e.g. check quota).",
        status: "RESOURCE_EXHAUSTED",
      },
    };
    const primaryAbort = Object.assign(new Error("request aborted"), {
      name: "AbortError",
      cause: vertexQuotaPayload,
    });
    // Both fallbacks fail with a plain HTTP 429 style error.
    const rateLimited = (message: string) => Object.assign(new Error(message), { status: 429 });
    const run = vi.fn();
    run.mockRejectedValueOnce(primaryAbort);
    run.mockRejectedValueOnce(rateLimited("fallback still rate limited"));
    run.mockRejectedValueOnce(rateLimited("final fallback still rate limited"));

    const outcome = runWithModelFallback({
      cfg,
      provider: "google",
      model: "gemini-3-flash-preview",
      run,
    });
    await expect(outcome).rejects.toThrow(/All models failed \(3\)/);

    // All three candidates must be attempted — the abort must not short-circuit
    expect(run).toHaveBeenCalledTimes(3);
    const probeOptions = { allowTransientCooldownProbe: true };
    expect(run).toHaveBeenNthCalledWith(1, "google", "gemini-3-flash-preview", probeOptions);
    expect(run).toHaveBeenNthCalledWith(2, "anthropic", "claude-haiku-3-5");
    expect(run).toHaveBeenNthCalledWith(3, "deepseek", "deepseek-chat");
  });
  it("throttles probe when called within 30s interval", async () => {
    const cfg = makeCfg();
    // Cooldown just about to expire (within probe margin)
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@ -140,10 +140,16 @@ async function runFallbackCandidate<T>(params: {
      result,
    };
  } catch (err) {
-    if (shouldRethrowAbort(err)) {
+    // Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
    // so they become FailoverErrors and continue the fallback loop instead of aborting.
    const normalizedFailover = coerceToFailoverError(err, {
      provider: params.provider,
      model: params.model,
    });
    if (shouldRethrowAbort(err) && !normalizedFailover) {
      throw err;
    }
-    return { ok: false, error: err };
+    return { ok: false, error: normalizedFailover ?? err };
  }
 }
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@ -28,7 +28,12 @@ import {
  resolveContextWindowInfo,
 } from "../context-window-guard.js";
 import { DEFAULT_CONTEXT_TOKENS, DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
-import { FailoverError, resolveFailoverStatus } from "../failover-error.js";
+import {
  coerceToFailoverError,
  describeFailoverError,
  FailoverError,
  resolveFailoverStatus,
 } from "../failover-error.js";
 import {
  applyLocalNoAuthHeaderOverride,
  ensureAuthProfileStore,
@ -1217,7 +1222,17 @@ export async function runEmbeddedPiAgent(
          }
          if (promptError && !aborted) {
-            const errorText = describeUnknownError(promptError);
+            // Normalize wrapped errors (e.g. abort-wrapped RESOURCE_EXHAUSTED) into
            // FailoverError so rate-limit classification works even for nested shapes.
            const normalizedPromptFailover = coerceToFailoverError(promptError, {
              provider: activeErrorContext.provider,
              model: activeErrorContext.model,
              profileId: lastProfileId,
            });
            const promptErrorDetails = normalizedPromptFailover
              ? describeFailoverError(normalizedPromptFailover)
              : describeFailoverError(promptError);
            const errorText = promptErrorDetails.message || describeUnknownError(promptError);
            if (await maybeRefreshCopilotForAuthError(errorText, copilotAuthRetry)) {
              authRetryPending = true;
              continue;
@ -1281,14 +1296,16 @@ export async function runEmbeddedPiAgent(
                },
              };
            }
-            const promptFailoverReason = classifyFailoverReason(errorText);
+            const promptFailoverReason =
              promptErrorDetails.reason ?? classifyFailoverReason(errorText);
            const promptProfileFailureReason =
              resolveAuthProfileFailureReason(promptFailoverReason);
            await maybeMarkAuthProfileFailure({
              profileId: lastProfileId,
              reason: promptProfileFailureReason,
            });
-            const promptFailoverFailure = isFailoverErrorMessage(errorText);
+            const promptFailoverFailure =
              promptFailoverReason !== null || isFailoverErrorMessage(errorText);
            // Capture the failing profile before auth-profile rotation mutates `lastProfileId`.
            const failedPromptProfileId = lastProfileId;
            const logPromptFailoverDecision = createFailoverDecisionLogger({
@ -1330,13 +1347,16 @@ export async function runEmbeddedPiAgent(
              const status = resolveFailoverStatus(promptFailoverReason ?? "unknown");
              logPromptFailoverDecision("fallback_model", { status });
              await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
-              throw new FailoverError(errorText, {
+              throw (
-                reason: promptFailoverReason ?? "unknown",
+                normalizedPromptFailover ??
-                provider,
+                new FailoverError(errorText, {
-                model: modelId,
+                  reason: promptFailoverReason ?? "unknown",
-                profileId: lastProfileId,
+                  provider,
-                status,
+                  model: modelId,
-              });
+                  profileId: lastProfileId,
                  status: resolveFailoverStatus(promptFailoverReason ?? "unknown"),
                })
              );
            }
            if (promptFailoverFailure || promptFailoverReason) {
              logPromptFailoverDecision("surface_error");