From c96ee423009ad7f2e8a56e72ec30e90af8eb9da5 Mon Sep 17 00:00:00 2001
From: Aaron Zhu <aaron@Aarons-MacBook-Air.local>
Date: Wed, 1 Apr 2026 15:29:59 +0800
Subject: [PATCH] fix(agents): normalize provider errors for better failover

Add provider-specific error patterns for AWS Bedrock, Azure OpenAI, Google
Vertex/Gemini, Ollama, Mistral, Cohere, DeepSeek, Groq, Together AI, and
Cloudflare Workers AI. These providers return errors in non-standard formats
that the generic classifiers miss, causing incorrect failover behavior (e.g.,
context overflow misclassified as a format error, ThrottlingException not
recognized as a rate limit).

Wire provider patterns into isContextOverflowError() and
classifyFailoverReason() as catch-all layers after generic classifiers.
---
 src/agents/pi-embedded-helpers/errors.ts      |  13 +-
 .../provider-error-patterns.test.ts           | 100 ++++++++++++++++
 .../provider-error-patterns.ts                | 111 ++++++++++++++++++
 3 files changed, 223 insertions(+), 1 deletion(-)
 create mode 100644 src/agents/pi-embedded-helpers/provider-error-patterns.test.ts
 create mode 100644 src/agents/pi-embedded-helpers/provider-error-patterns.ts

diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts
index 68a843469cc..c2ba6b22e43 100644
--- a/src/agents/pi-embedded-helpers/errors.ts
+++ b/src/agents/pi-embedded-helpers/errors.ts
@@ -27,6 +27,10 @@ import {
   isTimeoutErrorMessage,
   matchesFormatErrorPattern,
 } from "./failover-matches.js";
+import {
+  classifyProviderSpecificError,
+  matchesProviderContextOverflow,
+} from "./provider-error-patterns.js";
 import type { FailoverReason } from "./types.js";
 
 export {
@@ -235,7 +239,9 @@ export function isContextOverflowError(errorMessage?: string): boolean {
     errorMessage.includes("上下文超出") ||
     errorMessage.includes("上下文长度超") ||
     errorMessage.includes("超出最大上下文") ||
-    errorMessage.includes("请压缩上下文")
+    errorMessage.includes("请压缩上下文") ||
+    // Provider-specific patterns (Bedrock, Azure, Ollama, Mistral, Cohere, etc.)
+    matchesProviderContextOverflow(errorMessage)
   );
 }
 
@@ -1090,6 +1096,11 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
   if (isTimeoutErrorMessage(raw)) {
     return "timeout";
   }
+  // Provider-specific patterns as a final catch (Bedrock, Groq, Together AI, etc.)
+  const providerSpecific = classifyProviderSpecificError(raw);
+  if (providerSpecific) {
+    return providerSpecific;
+  }
   return null;
 }
 
diff --git a/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts b/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts
new file mode 100644
index 00000000000..ca4db4dfd19
--- /dev/null
+++ b/src/agents/pi-embedded-helpers/provider-error-patterns.test.ts
@@ -0,0 +1,100 @@
+import { describe, expect, it } from "vitest";
+import { classifyFailoverReason, isContextOverflowError } from "./errors.js";
+import {
+  classifyProviderSpecificError,
+  matchesProviderContextOverflow,
+} from "./provider-error-patterns.js";
+
+describe("matchesProviderContextOverflow", () => {
+  it.each([
+    // AWS Bedrock (ValidationException / ModelStreamErrorException wording)
+    "ValidationException: The input is too long for the model",
+    "ValidationException: Input token count exceeds the maximum number of input tokens",
+    "ModelStreamErrorException: Input is too long for this model",
+
+    // Google Vertex (INVALID_ARGUMENT wording)
+    "INVALID_ARGUMENT: input exceeds the maximum number of tokens",
+
+    // Ollama (message names the provider)
+    "ollama error: context length exceeded, too many tokens",
+
+    // Mistral (message names the provider)
+    "mistral: input is too long for this model",
+
+    // Cohere ("total tokens exceeds ..." wording, no provider name in the message)
+    "total tokens exceeds the model's maximum limit of 4096",
+
+    // Provider-agnostic "input is too long for model" wording
+    "input is too long for model gpt-5.4",
+  ])("matches provider-specific overflow: %s", (msg) => {
+    expect(matchesProviderContextOverflow(msg)).toBe(true);
+  });
+
+  it("does not match unrelated errors", () => {
+    expect(matchesProviderContextOverflow("rate limit exceeded")).toBe(false);
+    expect(matchesProviderContextOverflow("invalid api key")).toBe(false);
+    expect(matchesProviderContextOverflow("internal server error")).toBe(false);
+  });
+});
+
+describe("classifyProviderSpecificError", () => { // calls the pattern table directly, not via errors.ts
+  it("classifies Bedrock ThrottlingException as rate_limit", () => {
+    expect(classifyProviderSpecificError("ThrottlingException: Too many requests")).toBe(
+      "rate_limit",
+    );
+  });
+
+  it("classifies Bedrock ModelNotReadyException as overloaded", () => {
+    expect(classifyProviderSpecificError("ModelNotReadyException: model is not ready")).toBe(
+      "overloaded",
+    );
+  });
+
+  it("classifies Groq model_deactivated as model_not_found", () => {
+    expect(classifyProviderSpecificError("model_is_deactivated")).toBe("model_not_found");
+  });
+
+  it("classifies concurrency limit as rate_limit", () => { // both "has been reached" and "reached"
+    expect(classifyProviderSpecificError("concurrency limit has been reached")).toBe("rate_limit");
+    expect(classifyProviderSpecificError("concurrency limit reached")).toBe("rate_limit");
+  });
+
+  it("does not match generic 'model is not ready' without Bedrock prefix", () => { // class name required
+    expect(classifyProviderSpecificError("model is not ready")).toBeNull();
+  });
+
+  it("returns null for unmatched errors", () => {
+    expect(classifyProviderSpecificError("some random error")).toBeNull();
+  });
+});
+
+describe("isContextOverflowError with provider patterns", () => { // goes through errors.ts
+  it("detects Bedrock ValidationException as context overflow", () => {
+    expect(isContextOverflowError("ValidationException: The input is too long for the model")).toBe(
+      true,
+    );
+  });
+
+  // NOTE(review): this message contains the standard "context length exceeded" text asserted below,
+  it("detects Ollama context overflow", () => { // so it may never reach the Ollama regex - confirm
+    expect(isContextOverflowError("ollama error: context length exceeded")).toBe(true);
+  });
+
+  it("still detects standard context overflow patterns", () => { // regression guard for generic checks
+    expect(isContextOverflowError("context length exceeded")).toBe(true);
+    expect(isContextOverflowError("prompt is too long: 150000 tokens > 128000 maximum")).toBe(true);
+  });
+});
+
+describe("classifyFailoverReason with provider patterns", () => { // goes through errors.ts
+  it("classifies Bedrock ThrottlingException via provider patterns", () => {
+    expect(classifyFailoverReason("ThrottlingException: Too many concurrent requests")).toBe(
+      "rate_limit",
+    );
+  });
+
+  it("classifies Groq model_deactivated via provider patterns", () => {
+    expect(classifyFailoverReason("model_is_deactivated: this model has been deactivated")).toBe(
+      "model_not_found",
+    );
+  });
+});
diff --git a/src/agents/pi-embedded-helpers/provider-error-patterns.ts b/src/agents/pi-embedded-helpers/provider-error-patterns.ts
new file mode 100644
index 00000000000..391d727c717
--- /dev/null
+++ b/src/agents/pi-embedded-helpers/provider-error-patterns.ts
@@ -0,0 +1,111 @@
+/**
+ * Provider-specific error patterns that improve failover classification accuracy.
+ *
+ * Many providers return errors in non-standard formats. Without these patterns,
+ * errors get misclassified (e.g., a context overflow classified as "format"),
+ * causing the failover engine to choose wrong recovery strategies.
+ */
+
+import type { FailoverReason } from "./types.js";
+
+type ProviderErrorPattern = {
+  /** Regex run with `.test()` against the raw error message; the first matching entry wins. */
+  test: RegExp;
+  /** Failover reason returned by `classifyProviderSpecificError()` when `test` matches. */
+  reason: FailoverReason;
+};
+
+/**
+ * Context overflow wordings specific to individual providers. Every regex is
+ * case-insensitive and is run against the raw error message; errors.ts calls
+ * into this table from `isContextOverflowError()` after its generic checks.
+ */
+export const PROVIDER_CONTEXT_OVERFLOW_PATTERNS: readonly RegExp[] = [
+  // AWS Bedrock
+  /ValidationException.*(?:input is too long|max input token|input token.*exceed)/i,
+  /ValidationException.*(?:exceeds? the (?:maximum|max) (?:number of )?(?:input )?tokens)/i,
+  /ModelStreamErrorException.*(?:Input is too long|too many input tokens)/i,
+
+  // Azure OpenAI (sometimes wraps OpenAI errors differently)
+  /content_filter.*(?:prompt|input).*(?:too long|exceed)/i,
+
+  // Ollama / local models
+  /\bollama\b.*(?:context length|too many tokens|context window)/i,
+  /\btruncating input\b.*\btoo long\b/i,
+
+  // Mistral
+  /\bmistral\b.*(?:input.*too long|token limit.*exceeded)/i,
+
+  // Cohere
+  /\btotal tokens?.*exceeds? (?:the )?(?:model(?:'s)? )?(?:max|maximum|limit)/i,
+
+  // DeepSeek
+  /\bdeepseek\b.*(?:input.*too long|context.*exceed)/i,
+
+  // Google Vertex / Gemini: "exceeds the maximum" counts only with token/context wording (not e.g. temperature).
+  /INVALID_ARGUMENT(?:(?=.*\b(?:tokens?|context)\b).*exceeds? the (?:maximum|max)|.*input.*too (?:long|large))/i,
+
+  // Generic "input too long" pattern that isn't covered by existing checks
+  /\binput (?:is )?too long for (?:the )?model\b/i,
+];
+
+/**
+ * Provider-specific patterns that map to specific failover reasons.
+ * Entries are tried in declaration order and the first match wins. errors.ts
+ * consults this table as a final catch, after its generic classifiers.
+ */
+export const PROVIDER_SPECIFIC_PATTERNS: readonly ProviderErrorPattern[] = [
+  // AWS Bedrock: ThrottlingException (or the bare "Too many concurrent requests" text) is a rate limit
+  {
+    test: /ThrottlingException|Too many concurrent requests/i,
+    reason: "rate_limit",
+  },
+
+  // AWS Bedrock: ModelNotReadyException (match the class name only; bare "model is not ready" must not match)
+  {
+    test: /ModelNotReadyException/i,
+    reason: "overloaded",
+  },
+
+  // Azure: content_policy_violation deliberately has no entry here. It is a content
+  // moderation rejection, not a transient error, so it should not trigger failover.
+
+  // Groq: model_deactivated is permanent, so it is reported as model_not_found
+  {
+    test: /model(?:_is)?_deactivated|model has been deactivated/i,
+    reason: "model_not_found",
+  },
+
+  // Together AI / Fireworks: "concurrency limit ... reached" rate limit messages
+  {
+    test: /\bconcurrency limit\b.*\breached\b/i,
+    reason: "rate_limit",
+  },
+
+  // Cloudflare Workers AI: "worker_ai"/"workers_ai" followed by a rate, limit, or quota keyword
+  {
+    test: /\bworkers?_ai\b.*\b(?:rate|limit|quota)\b/i,
+    reason: "rate_limit",
+  },
+];
+
+/**
+ * True when `errorMessage` matches at least one provider-specific context overflow regex.
+ */
+export function matchesProviderContextOverflow(errorMessage: string): boolean {
+  const matchesMessage = (candidate: RegExp): boolean => candidate.test(errorMessage);
+  return PROVIDER_CONTEXT_OVERFLOW_PATTERNS.some(matchesMessage);
+}
+
+/**
+ * Map an error message to a failover reason using PROVIDER_SPECIFIC_PATTERNS.
+ * Entries are tried in declaration order and the first regex that matches wins.
+ * Returns null when no entry matches. classifyFailoverReason() in errors.ts
+ * calls this as its final catch, after the generic classifiers.
+ */
+export function classifyProviderSpecificError(errorMessage: string): FailoverReason | null {
+  const firstMatch = PROVIDER_SPECIFIC_PATTERNS.find((candidate) =>
+    candidate.test.test(errorMessage),
+  );
+  return firstMatch?.reason ?? null;
+}