fix(agents): normalize abort-wrapped RESOURCE_EXHAUSTED into failover errors (#11972)

2026-03-08 12:21:41 +00:00 · 2026-03-08 12:21:41 +00:00 · dac220bd88
parent 2f5d3b6574
commit dac220bd88
4 changed files with 179 additions and 15 deletions
--- a/src/agents/failover-error.ts
+++ b/src/agents/failover-error.ts
@@ -72,9 +72,16 @@ function getStatusCode(err: unknown): number | undefined {
  if (!err || typeof err !== "object") {
    return undefined;
  }
+  // Dig into nested `err.error` shapes (e.g. Google Vertex abort wrappers)
+  const nestedError =
+    "error" in err && err.error && typeof err.error === "object"
+      ? (err.error as { status?: unknown; code?: unknown })
+      : undefined;
  const candidate =
    (err as { status?: unknown; statusCode?: unknown }).status ??
-    (err as { statusCode?: unknown }).statusCode;
+    (err as { statusCode?: unknown }).statusCode ??
+    nestedError?.code ??
+    nestedError?.status;
  if (typeof candidate === "number") {
    return candidate;
  }
@@ -88,7 +95,11 @@ function getErrorCode(err: unknown): string | undefined {
  if (!err || typeof err !== "object") {
    return undefined;
  }
-  const candidate = (err as { code?: unknown }).code;
+  const nestedError =
+    "error" in err && err.error && typeof err.error === "object"
+      ? (err.error as { code?: unknown; status?: unknown })
+      : undefined;
+  const candidate = (err as { code?: unknown }).code ?? nestedError?.status ?? nestedError?.code;
  if (typeof candidate !== "string") {
    return undefined;
  }
@@ -114,10 +125,53 @@ function getErrorMessage(err: unknown): string {
    if (typeof message === "string") {
      return message;
    }
+    // Extract message from nested `err.error.message` (e.g. Google Vertex wrappers)
+    const nestedMessage =
+      "error" in err &&
+      err.error &&
+      typeof err.error === "object" &&
+      typeof (err.error as { message?: unknown }).message === "string"
+        ? ((err.error as { message: string }).message ?? "")
+        : "";
+    if (nestedMessage) {
+      return nestedMessage;
+    }
  }
  return "";
 }

+/** Returns `err.cause` for non-null objects that carry a `cause` key; otherwise `undefined`. */
+function getErrorCause(err: unknown): unknown {
+  const hasCause = typeof err === "object" && err !== null && "cause" in err;
+  // Non-objects, null, and cause-less objects all report `undefined`.
+  return hasCause ? (err as { cause?: unknown }).cause : undefined;
+}
+
+/** Maps symbolic error codes such as RESOURCE_EXHAUSTED to a failover reason, or null. */
+function classifyFailoverReasonFromSymbolicCode(raw: string | undefined): FailoverReason | null {
+  const code = (raw ?? "").trim().toUpperCase();
+  const rateLimitCodes = [
+    "RESOURCE_EXHAUSTED",
+    "RATE_LIMIT",
+    "RATE_LIMITED",
+    "RATE_LIMIT_EXCEEDED",
+    "TOO_MANY_REQUESTS",
+    "THROTTLED",
+    "THROTTLING",
+    "THROTTLINGEXCEPTION",
+    "THROTTLING_EXCEPTION",
+  ];
+  if (rateLimitCodes.includes(code)) {
+    return "rate_limit";
+  }
+  const overloadedCodes = ["OVERLOADED", "OVERLOADED_ERROR"];
+  if (overloadedCodes.includes(code)) {
+    return "overloaded";
+  }
+  // Missing, blank, and unrecognized codes carry no failover signal.
+  return null;
+}
+
 function hasTimeoutHint(err: unknown): boolean {
  if (!err) {
    return false;
@@ -160,6 +214,12 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
    return statusReason;
  }

+  // Check symbolic error codes (e.g. RESOURCE_EXHAUSTED from Google APIs)
+  const symbolicCodeReason = classifyFailoverReasonFromSymbolicCode(getErrorCode(err));
+  if (symbolicCodeReason) {
+    return symbolicCodeReason;
+  }
+
  const code = (getErrorCode(err) ?? "").toUpperCase();
  if (
    [
@@ -181,6 +241,14 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
  if (isTimeoutError(err)) {
    return "timeout";
  }
+  // Walk into error cause chain (e.g. AbortError wrapping a rate-limit cause)
+  const cause = getErrorCause(err);
+  if (cause && cause !== err) {
+    const causeReason = resolveFailoverReasonFromError(cause);
+    if (causeReason) {
+      return causeReason;
+    }
+  }
  if (!message) {
    return null;
  }
--- a/src/agents/model-fallback.probe.test.ts
+++ b/src/agents/model-fallback.probe.test.ts
@@ -331,6 +331,76 @@ describe("runWithModelFallback – probe logic", () => {
    });
  });

+  it("keeps walking remaining fallbacks after an abort-wrapped RESOURCE_EXHAUSTED probe failure", async () => {
+    // Google primary backed by two fallbacks that live on other providers.
+    const fallbacks = ["anthropic/claude-haiku-3-5", "deepseek/deepseek-chat"];
+    const cfg = makeCfg({
+      agents: {
+        defaults: {
+          model: { primary: "google/gemini-3-flash-preview", fallbacks },
+        },
+      },
+    } as Partial<OpenClawConfig>);
+
+    // One auth profile per known provider; any other provider has none.
+    const profilesByProvider = new Map<string, string[]>([
+      ["google", ["google-profile-1"]],
+      ["anthropic", ["anthropic-profile-1"]],
+      ["deepseek", ["deepseek-profile-1"]],
+    ]);
+    mockedResolveAuthProfileOrder.mockImplementation(({ provider }: { provider: string }) => [
+      // Return a fresh array on every call.
+      ...(profilesByProvider.get(provider) ?? []),
+    ]);
+
+    // Only Google profiles report as cooling down; the soonest cooldown expiry is
+    // stubbed to NOW + 30 * 1000 and the unavailability reason to "rate_limit".
+    mockedIsProfileInCooldown.mockImplementation((_store, profileId: string) =>
+      profileId.startsWith("google"),
+    );
+    mockedGetSoonestCooldownExpiry.mockReturnValue(NOW + 30 * 1000);
+    mockedResolveProfilesUnavailableReason.mockReturnValue("rate_limit");
+
+    // Simulate Google Vertex abort-wrapped RESOURCE_EXHAUSTED (the shape that was
+    // previously swallowed by shouldRethrowAbort before the fallback loop could continue):
+    // an AbortError whose `cause.error` carries the numeric code and symbolic status.
+    const quotaPayload = {
+      code: 429,
+      message: "Resource has been exhausted (e.g. check quota).",
+      status: "RESOURCE_EXHAUSTED",
+    };
+    const primaryAbort = new Error("request aborted");
+    Object.assign(primaryAbort, {
+      name: "AbortError",
+      cause: { error: quotaPayload },
+    });
+
+    // Both fallbacks reject with `status: 429` errors, so every candidate fails.
+    const rateLimited = (message: string) => Object.assign(new Error(message), { status: 429 });
+    const run = vi.fn();
+    run.mockRejectedValueOnce(primaryAbort);
+    run.mockRejectedValueOnce(rateLimited("fallback still rate limited"));
+    run.mockRejectedValueOnce(rateLimited("final fallback still rate limited"));
+
+    // Start the fallback run from the Google primary.
+    const outcome = runWithModelFallback({
+      cfg,
+      provider: "google",
+      model: "gemini-3-flash-preview",
+      run,
+    });
+    await expect(outcome).rejects.toThrow(/All models failed \(3\)/);
+
+    // All three candidates must be attempted — the abort must not short-circuit.
+    // Expected invocations in order; only the primary carries the probe option.
+    expect(run).toHaveBeenCalledTimes(3);
+    expect(run).toHaveBeenNthCalledWith(1, "google", "gemini-3-flash-preview", {
+      allowTransientCooldownProbe: true,
+    });
+    expect(run).toHaveBeenNthCalledWith(2, "anthropic", "claude-haiku-3-5");
+    expect(run).toHaveBeenNthCalledWith(3, "deepseek", "deepseek-chat");
+  });
+
  it("throttles probe when called within 30s interval", async () => {
    const cfg = makeCfg();
    // Cooldown just about to expire (within probe margin)
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@@ -140,10 +140,16 @@ async function runFallbackCandidate<T>(params: {
      result,
    };
  } catch (err) {
-    if (shouldRethrowAbort(err)) {
+    // Normalize abort-wrapped rate-limit errors (e.g. Google Vertex RESOURCE_EXHAUSTED)
+    // so they become FailoverErrors and continue the fallback loop instead of aborting.
+    const normalizedFailover = coerceToFailoverError(err, {
+      provider: params.provider,
+      model: params.model,
+    });
+    if (shouldRethrowAbort(err) && !normalizedFailover) {
      throw err;
    }
-    return { ok: false, error: err };
+    return { ok: false, error: normalizedFailover ?? err };
  }
 }

--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -28,7 +28,12 @@ import {
  resolveContextWindowInfo,
 } from "../context-window-guard.js";
 import { DEFAULT_CONTEXT_TOKENS, DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
-import { FailoverError, resolveFailoverStatus } from "../failover-error.js";
+import {
+  coerceToFailoverError,
+  describeFailoverError,
+  FailoverError,
+  resolveFailoverStatus,
+} from "../failover-error.js";
 import {
  applyLocalNoAuthHeaderOverride,
  ensureAuthProfileStore,
@@ -1217,7 +1222,17 @@ export async function runEmbeddedPiAgent(
          }

          if (promptError && !aborted) {
-            const errorText = describeUnknownError(promptError);
+            // Normalize wrapped errors (e.g. abort-wrapped RESOURCE_EXHAUSTED) into
+            // FailoverError so rate-limit classification works even for nested shapes.
+            const normalizedPromptFailover = coerceToFailoverError(promptError, {
+              provider: activeErrorContext.provider,
+              model: activeErrorContext.model,
+              profileId: lastProfileId,
+            });
+            const promptErrorDetails = normalizedPromptFailover
+              ? describeFailoverError(normalizedPromptFailover)
+              : describeFailoverError(promptError);
+            const errorText = promptErrorDetails.message || describeUnknownError(promptError);
            if (await maybeRefreshCopilotForAuthError(errorText, copilotAuthRetry)) {
              authRetryPending = true;
              continue;
@@ -1281,14 +1296,16 @@ export async function runEmbeddedPiAgent(
                },
              };
            }
-            const promptFailoverReason = classifyFailoverReason(errorText);
+            const promptFailoverReason =
+              promptErrorDetails.reason ?? classifyFailoverReason(errorText);
            const promptProfileFailureReason =
              resolveAuthProfileFailureReason(promptFailoverReason);
            await maybeMarkAuthProfileFailure({
              profileId: lastProfileId,
              reason: promptProfileFailureReason,
            });
-            const promptFailoverFailure = isFailoverErrorMessage(errorText);
+            const promptFailoverFailure =
+              promptFailoverReason !== null || isFailoverErrorMessage(errorText);
            // Capture the failing profile before auth-profile rotation mutates `lastProfileId`.
            const failedPromptProfileId = lastProfileId;
            const logPromptFailoverDecision = createFailoverDecisionLogger({
@@ -1330,13 +1347,16 @@ export async function runEmbeddedPiAgent(
              const status = resolveFailoverStatus(promptFailoverReason ?? "unknown");
              logPromptFailoverDecision("fallback_model", { status });
              await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
-              throw new FailoverError(errorText, {
-                reason: promptFailoverReason ?? "unknown",
-                provider,
-                model: modelId,
-                profileId: lastProfileId,
-                status,
-              });
+              throw (
+                normalizedPromptFailover ??
+                new FailoverError(errorText, {
+                  reason: promptFailoverReason ?? "unknown",
+                  provider,
+                  model: modelId,
+                  profileId: lastProfileId,
+                  status: resolveFailoverStatus(promptFailoverReason ?? "unknown"),
+                })
+              );
            }
            if (promptFailoverFailure || promptFailoverReason) {
              logPromptFailoverDecision("surface_error");