From 131f6dac37a8e09da209b0913b684a17d06ff13f Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Wed, 1 Apr 2026 21:50:07 +0900 Subject: [PATCH] refactor: unify failover signal classification --- src/agents/failover-error.test.ts | 30 ++++ src/agents/failover-error.ts | 136 +++++++---------- src/agents/pi-embedded-helpers/errors.ts | 183 +++++++++++++++++------ 3 files changed, 224 insertions(+), 125 deletions(-) diff --git a/src/agents/failover-error.test.ts b/src/agents/failover-error.test.ts index f37ade88155..fcb9388932a 100644 --- a/src/agents/failover-error.test.ts +++ b/src/agents/failover-error.test.ts @@ -6,6 +6,7 @@ import { resolveFailoverReasonFromError, resolveFailoverStatus, } from "./failover-error.js"; +import { classifyFailoverSignal } from "./pi-embedded-helpers/errors.js"; // OpenAI 429 example shape: https://help.openai.com/en/articles/5955604-how-can-i-solve-429-too-many-requests-errors const OPENAI_RATE_LIMIT_MESSAGE = @@ -222,6 +223,20 @@ describe("failover-error", () => { ).toBeNull(); }); + it("keeps context overflow first-class in the shared signal classifier", () => { + expect( + classifyFailoverSignal({ + status: 400, + message: "INVALID_ARGUMENT: input exceeds the maximum number of tokens", + }), + ).toEqual({ kind: "context_overflow" }); + expect( + classifyFailoverSignal({ + message: "prompt is too long: 150000 tokens > 128000 maximum", + }), + ).toEqual({ kind: "context_overflow" }); + }); + it("treats HTTP 422 as format error", () => { expect( resolveFailoverReasonFromError({ @@ -383,6 +398,12 @@ describe("failover-error", () => { expect(resolveFailoverReasonFromError({ code: "EPIPE" })).toBe("timeout"); }); + it("infers rate-limit and overload from symbolic error codes", () => { + expect(resolveFailoverReasonFromError({ code: "RESOURCE_EXHAUSTED" })).toBe("rate_limit"); + expect(resolveFailoverReasonFromError({ code: "THROTTLING_EXCEPTION" })).toBe("rate_limit"); + expect(resolveFailoverReasonFromError({ code: "OVERLOADED_ERROR" })).toBe("overloaded"); + }); + it("infers timeout from abort/error stop-reason messages", () => { expect(resolveFailoverReasonFromError({ message: "Unhandled stop reason: abort" })).toBe( "timeout", @@ -440,6 +461,15 @@ describe("failover-error", () => { expect(coerceToFailoverError(err)?.status).toBe(429); }); + it("lets wrapped causes override parent context-overflow classifications", () => { + const err = new Error("INVALID_ARGUMENT: input exceeds the maximum number of tokens", { + cause: { code: "RESOURCE_EXHAUSTED" }, + }); + + expect(resolveFailoverReasonFromError(err)).toBe("rate_limit"); + expect(coerceToFailoverError(err)?.reason).toBe("rate_limit"); + }); + it("coerces failover-worthy errors into FailoverError with metadata", () => { const err = coerceToFailoverError("credit balance too low", { provider: "anthropic", diff --git a/src/agents/failover-error.ts b/src/agents/failover-error.ts index dd482310a2b..12814e2d9f3 100644 --- a/src/agents/failover-error.ts +++ b/src/agents/failover-error.ts @@ -1,10 +1,10 @@ import { readErrorName } from "../infra/errors.js"; +import { isTimeoutErrorMessage, type FailoverReason } from "./pi-embedded-helpers.js"; import { - classifyFailoverReason, - classifyFailoverReasonFromHttpStatus, - isTimeoutErrorMessage, - type FailoverReason, -} from "./pi-embedded-helpers.js"; + classifyFailoverSignal, + type FailoverClassification, + type FailoverSignal, +} from "./pi-embedded-helpers/errors.js"; const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i; @@ -165,31 +165,6 @@ function getErrorCause(err: unknown): unknown { return (err as { cause?: unknown }).cause; } -/** Classify rate-limit / overloaded from symbolic error codes like RESOURCE_EXHAUSTED. */ -function classifyFailoverReasonFromSymbolicCode(raw: string | undefined): FailoverReason | null { - const normalized = raw?.trim().toUpperCase(); - if (!normalized) { - return null; - } - switch (normalized) { - case "RESOURCE_EXHAUSTED": - case "RATE_LIMIT": - case "RATE_LIMITED": - case "RATE_LIMIT_EXCEEDED": - case "TOO_MANY_REQUESTS": - case "THROTTLED": - case "THROTTLING": - case "THROTTLINGEXCEPTION": - case "THROTTLING_EXCEPTION": - return "rate_limit"; - case "OVERLOADED": - case "OVERLOADED_ERROR": - return "overloaded"; - default: - return null; - } -} - function hasTimeoutHint(err: unknown): boolean { if (!err) { return false; @@ -220,59 +195,56 @@ export function isTimeoutError(err: unknown): boolean { return hasTimeoutHint(cause) || hasTimeoutHint(reason); } -export function resolveFailoverReasonFromError(err: unknown): FailoverReason | null { - if (isFailoverError(err)) { - return err.reason; - } +function failoverReasonFromClassification( + classification: FailoverClassification | null, +): FailoverReason | null { + return classification?.kind === "reason" ? classification.reason : null; +} - const status = getStatusCode(err); +function normalizeErrorSignal(err: unknown): FailoverSignal { const message = getErrorMessage(err); - const statusReason = classifyFailoverReasonFromHttpStatus(status, message); - if (statusReason) { - return statusReason; + return { + status: getStatusCode(err), + code: getErrorCode(err), + message: message || undefined, + }; +} + +function resolveFailoverClassificationFromError(err: unknown): FailoverClassification | null { + if (isFailoverError(err)) { + return { + kind: "reason", + reason: err.reason, + }; } - // Check symbolic error codes (e.g. RESOURCE_EXHAUSTED from Google APIs) - const symbolicCodeReason = classifyFailoverReasonFromSymbolicCode(getErrorCode(err)); - if (symbolicCodeReason) { - return symbolicCodeReason; - } - - const code = (getErrorCode(err) ?? "").toUpperCase(); - if ( - [ - "ETIMEDOUT", - "ESOCKETTIMEDOUT", - "ECONNRESET", - "ECONNABORTED", - "ECONNREFUSED", - "ENETUNREACH", - "EHOSTUNREACH", - "EHOSTDOWN", - "ENETRESET", - "EPIPE", - "EAI_AGAIN", - ].includes(code) - ) { - return "timeout"; - } - // Walk into error cause chain *before* timeout heuristics so that a specific - // cause (e.g. RESOURCE_EXHAUSTED wrapped in AbortError) overrides a parent - // message-based "timeout" guess from isTimeoutError. - const cause = getErrorCause(err); - if (cause && cause !== err) { - const causeReason = resolveFailoverReasonFromError(cause); - if (causeReason) { - return causeReason; + const classification = classifyFailoverSignal(normalizeErrorSignal(err)); + if (!classification || classification.kind === "context_overflow") { + // Let wrapped causes override parent timeout/overflow guesses. + const cause = getErrorCause(err); + if (cause && cause !== err) { + const causeClassification = resolveFailoverClassificationFromError(cause); + if (causeClassification) { + return causeClassification; + } } } + + if (classification) { + return classification; + } + if (isTimeoutError(err)) { - return "timeout"; + return { + kind: "reason", + reason: "timeout", + }; } - if (!message) { - return null; - } - return classifyFailoverReason(message); + return null; +} + +export function resolveFailoverReasonFromError(err: unknown): FailoverReason | null { + return failoverReasonFromClassification(resolveFailoverClassificationFromError(err)); } export function describeFailoverError(err: unknown): { @@ -289,12 +261,13 @@ export function describeFailoverError(err: unknown): { code: err.code, }; } - const message = getErrorMessage(err) || String(err); + const signal = normalizeErrorSignal(err); + const message = signal.message ?? String(err); return { message, reason: resolveFailoverReasonFromError(err) ?? undefined, - status: getStatusCode(err), - code: getErrorCode(err), + status: signal.status, + code: signal.code, }; } @@ -314,9 +287,10 @@ export function coerceToFailoverError( return null; } - const message = getErrorMessage(err) || String(err); - const status = getStatusCode(err) ?? resolveFailoverStatus(reason); - const code = getErrorCode(err); + const signal = normalizeErrorSignal(err); + const message = signal.message ?? String(err); + const status = signal.status ?? resolveFailoverStatus(reason); + const code = signal.code; return new FailoverError(message, { reason, diff --git a/src/agents/pi-embedded-helpers/errors.ts b/src/agents/pi-embedded-helpers/errors.ts index 62c21695450..de4a89d57a0 100644 --- a/src/agents/pi-embedded-helpers/errors.ts +++ b/src/agents/pi-embedded-helpers/errors.ts @@ -364,6 +364,21 @@ const HTTP_ERROR_HINTS = [ type PaymentRequiredFailoverReason = Extract; +export type FailoverSignal = { + status?: number; + code?: string; + message?: string; +}; + +export type FailoverClassification = + | { + kind: "reason"; + reason: FailoverReason; + } + | { + kind: "context_overflow"; + }; + const BILLING_402_HINTS = [ "insufficient credits", "insufficient quota", @@ -394,6 +409,19 @@ const RAW_402_MARKER_RE = /["']?(?:status|code)["']?\s*[:=]\s*402\b|\bhttp\s*402\b|\berror(?:\s+code)?\s*[:=]?\s*402\b|\b(?:got|returned|received)\s+(?:a\s+)?402\b|^\s*402\s+payment required\b|^\s*402\s+.*used up your points\b/i; const LEADING_402_WRAPPER_RE = /^(?:error[:\s-]+)?(?:(?:http\s*)?402(?:\s+payment required)?|payment required)(?:[:\s-]+|$)/i; +const TIMEOUT_ERROR_CODES = new Set([ + "ETIMEDOUT", + "ESOCKETTIMEDOUT", + "ECONNRESET", + "ECONNABORTED", + "ECONNREFUSED", + "ENETUNREACH", + "EHOSTUNREACH", + "EHOSTDOWN", + "ENETRESET", + "EPIPE", + "EAI_AGAIN", +]); function includesAnyHint(text: string, hints: readonly string[]): boolean { return hints.some((hint) => text.includes(hint)); @@ -467,6 +495,16 @@ function classifyFailoverReasonFrom402Text(raw: string): PaymentRequiredFailover return classify402Message(raw); } +function toReasonClassification(reason: FailoverReason): FailoverClassification { + return { kind: "reason", reason }; +} + +function failoverReasonFromClassification( + classification: FailoverClassification | null, +): FailoverReason | null { + return classification?.kind === "reason" ? classification.reason : null; +} + export function isTransientHttpError(raw: string): boolean { const trimmed = raw.trim(); if (!trimmed) { @@ -483,25 +521,36 @@ export function classifyFailoverReasonFromHttpStatus( status: number | undefined, message?: string, ): FailoverReason | null { - const messageReason = message ? classifyFailoverReasonFromMessage(message) : null; + const messageClassification = message ? classifyFailoverClassificationFromMessage(message) : null; + return failoverReasonFromClassification( + classifyFailoverClassificationFromHttpStatus(status, message, messageClassification), + ); +} + +function classifyFailoverClassificationFromHttpStatus( + status: number | undefined, + message: string | undefined, + messageClassification: FailoverClassification | null, +): FailoverClassification | null { + const messageReason = failoverReasonFromClassification(messageClassification); if (typeof status !== "number" || !Number.isFinite(status)) { return null; } if (status === 402) { - return message ? classify402Message(message) : "billing"; + return toReasonClassification(message ? classify402Message(message) : "billing"); } if (status === 429) { - return "rate_limit"; + return toReasonClassification("rate_limit"); } if (status === 401 || status === 403) { if (message && isAuthPermanentErrorMessage(message)) { - return "auth_permanent"; + return toReasonClassification("auth_permanent"); } - return "auth"; + return toReasonClassification("auth"); } if (status === 408) { - return "timeout"; + return toReasonClassification("timeout"); } if (status === 410) { // HTTP 410 is only a true session-expiry signal when the payload says the @@ -514,46 +563,65 @@ export function classifyFailoverReasonFromHttpStatus( messageReason === "auth_permanent" || messageReason === "auth" ) { - return messageReason; + return messageClassification; } - return "timeout"; + return toReasonClassification("timeout"); } if (status === 503) { if (messageReason === "overloaded") { - return "overloaded"; + return messageClassification; } - return "timeout"; + return toReasonClassification("timeout"); } if (status === 499) { if (messageReason === "overloaded") { - return "overloaded"; + return messageClassification; } - return "timeout"; + return toReasonClassification("timeout"); } if (status === 500 || status === 502 || status === 504) { - return "timeout"; + return toReasonClassification("timeout"); } if (status === 529) { - return "overloaded"; + return toReasonClassification("overloaded"); } if (status === 400 || status === 422) { // 400/422 are ambiguous: inspect the payload first so provider-specific // rate limits, auth failures, model-not-found errors, and billing signals // are not collapsed into generic "format" failures. - if (messageReason) { - return messageReason; + if (messageClassification) { + return messageClassification; } - // Context overflow does not map to a FailoverReason. Keep it out of the - // generic format bucket so callers can handle compaction/reset separately. - if (message && isContextOverflowError(message)) { - return null; - } - return "format"; + return toReasonClassification("format"); } return null; } -function classifyFailoverReasonFromMessage(raw: string): FailoverReason | null { +function classifyFailoverReasonFromCode(raw: string | undefined): FailoverReason | null { + const normalized = raw?.trim().toUpperCase(); + if (!normalized) { + return null; + } + switch (normalized) { + case "RESOURCE_EXHAUSTED": + case "RATE_LIMIT": + case "RATE_LIMITED": + case "RATE_LIMIT_EXCEEDED": + case "TOO_MANY_REQUESTS": + case "THROTTLED": + case "THROTTLING": + case "THROTTLINGEXCEPTION": + case "THROTTLING_EXCEPTION": + return "rate_limit"; + case "OVERLOADED": + case "OVERLOADED_ERROR": + return "overloaded"; + default: + return TIMEOUT_ERROR_CODES.has(normalized) ? "timeout" : null; + } +} + +function classifyFailoverClassificationFromMessage(raw: string): FailoverClassification | null { if (isImageDimensionErrorMessage(raw)) { return null; } @@ -561,63 +629,89 @@ function classifyFailoverReasonFromMessage(raw: string): FailoverReason | null { return null; } if (isCliSessionExpiredErrorMessage(raw)) { - return "session_expired"; + return toReasonClassification("session_expired"); } if (isModelNotFoundErrorMessage(raw)) { - return "model_not_found"; + return toReasonClassification("model_not_found"); + } + if (isContextOverflowError(raw)) { + return { kind: "context_overflow" }; } const reasonFrom402Text = classifyFailoverReasonFrom402Text(raw); if (reasonFrom402Text) { - return reasonFrom402Text; + return toReasonClassification(reasonFrom402Text); } if (isPeriodicUsageLimitErrorMessage(raw)) { - return isBillingErrorMessage(raw) ? "billing" : "rate_limit"; + return toReasonClassification(isBillingErrorMessage(raw) ? "billing" : "rate_limit"); } if (isRateLimitErrorMessage(raw)) { - return "rate_limit"; + return toReasonClassification("rate_limit"); } if (isOverloadedErrorMessage(raw)) { - return "overloaded"; + return toReasonClassification("overloaded"); } if (isTransientHttpError(raw)) { const status = extractLeadingHttpStatus(raw.trim()); if (status?.code === 529) { - return "overloaded"; + return toReasonClassification("overloaded"); } - return "timeout"; + return toReasonClassification("timeout"); } // Billing and auth classifiers run before the broad isJsonApiInternalServerError // check so that provider errors like {"type":"api_error","message":"insufficient // balance"} are correctly classified as "billing"/"auth" rather than "timeout". if (isBillingErrorMessage(raw)) { - return "billing"; + return toReasonClassification("billing"); } if (isAuthPermanentErrorMessage(raw)) { - return "auth_permanent"; + return toReasonClassification("auth_permanent"); } if (isAuthErrorMessage(raw)) { - return "auth"; + return toReasonClassification("auth"); } if (isServerErrorMessage(raw)) { - return "timeout"; + return toReasonClassification("timeout"); } if (isJsonApiInternalServerError(raw)) { - return "timeout"; + return toReasonClassification("timeout"); } if (isCloudCodeAssistFormatError(raw)) { - return "format"; + return toReasonClassification("format"); } if (isTimeoutErrorMessage(raw)) { - return "timeout"; + return toReasonClassification("timeout"); } // Provider-specific patterns as a final catch (Bedrock, Groq, Together AI, etc.) const providerSpecific = classifyProviderSpecificError(raw); if (providerSpecific) { - return providerSpecific; + return toReasonClassification(providerSpecific); } return null; } +export function classifyFailoverSignal(signal: FailoverSignal): FailoverClassification | null { + const inferredStatus = + typeof signal.status === "number" && Number.isFinite(signal.status) + ? signal.status + : extractLeadingHttpStatus(signal.message?.trim() ?? "")?.code; + const messageClassification = signal.message + ? classifyFailoverClassificationFromMessage(signal.message) + : null; + const statusClassification = classifyFailoverClassificationFromHttpStatus( + inferredStatus, + signal.message, + messageClassification, + ); + if (statusClassification) { + return statusClassification; + } + const codeReason = classifyFailoverReasonFromCode(signal.code); + if (codeReason) { + return toReasonClassification(codeReason); + } + return messageClassification; +} + function coerceText(value: unknown): string { if (typeof value === "string") { return value; @@ -1103,11 +1197,12 @@ function isCliSessionExpiredErrorMessage(raw: string): boolean { export function classifyFailoverReason(raw: string): FailoverReason | null { const trimmed = raw.trim(); const leadingStatus = extractLeadingHttpStatus(trimmed); - const statusReason = classifyFailoverReasonFromHttpStatus(leadingStatus?.code, raw); - if (statusReason) { - return statusReason; - } - return classifyFailoverReasonFromMessage(raw); + return failoverReasonFromClassification( + classifyFailoverSignal({ + status: leadingStatus?.code, + message: raw, + }), + ); } export function isFailoverErrorMessage(raw: string): boolean {