From 42e1d489fd370c1b35e3d46ebc8caaf84d57b3ac Mon Sep 17 00:00:00 2001 From: Extra Small Date: Fri, 3 Apr 2026 09:17:34 -0700 Subject: [PATCH] fix(auth): use shorter backoff for auth_permanent failures auth_permanent errors (e.g. API_KEY_INVALID) can be caused by transient provider outages rather than genuinely revoked credentials. Previously these used the same 5h-24h billing backoff, which left providers disabled long after the upstream issue resolved. Introduce separate authPermanentBackoffMinutes (default: 10) and authPermanentMaxMinutes (default: 60) config options so auth_permanent failures recover in minutes rather than hours. Fixes #56838 --- src/agents/auth-profiles/usage.test.ts | 6 +++--- src/agents/auth-profiles/usage.ts | 29 +++++++++++++++++++++++++- src/config/schema.help.ts | 4 ++++ src/config/schema.labels.ts | 2 ++ src/config/types.auth.ts | 10 +++++++++ 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index 6d88f45f969..9d51d54df6f 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -653,7 +653,7 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () label: "disabledUntil(auth_permanent)", reason: "auth_permanent" as const, buildUsageStats: (now: number): WindowStats => ({ - disabledUntil: now + 20 * 60 * 60 * 1000, + disabledUntil: now + 50 * 60 * 1000, disabledReason: "auth_permanent", errorCount: 5, failureCounts: { auth_permanent: 5 }, @@ -724,8 +724,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () lastFailureAt: now - 60_000, }), // errorCount resets, auth_permanent count resets to 1 → - // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h - expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000, + // calculateAuthProfileBillingDisableMsWithConfig(1, 10m, 60m) = 10m + expectedUntil: (now: number) => now + 10 * 60 * 1000, readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil, }, ]; diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index e77cf91dc05..5b6a7d27d79 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -516,6 +516,8 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number { type ResolvedAuthCooldownConfig = { billingBackoffMs: number; billingMaxMs: number; + authPermanentBackoffMs: number; + authPermanentMaxMs: number; failureWindowMs: number; }; @@ -556,9 +558,17 @@ function resolveAuthCooldownConfig(params: { defaults.failureWindowHours, ); + const resolveMinutes = (value: unknown, fallback: number) => + typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback; + + const authPermanentBackoffMinutes = resolveMinutes(cooldowns?.authPermanentBackoffMinutes, 10); + const authPermanentMaxMinutes = resolveMinutes(cooldowns?.authPermanentMaxMinutes, 60); + return { billingBackoffMs: billingBackoffHours * 60 * 60 * 1000, billingMaxMs: billingMaxHours * 60 * 60 * 1000, + authPermanentBackoffMs: authPermanentBackoffMinutes * 60 * 1000, + authPermanentMaxMs: authPermanentMaxMinutes * 60 * 1000, failureWindowMs: failureWindowHours * 60 * 60 * 1000, }; } @@ -662,7 +672,7 @@ function computeNextProfileUsageStats(params: { lastFailureAt: params.now, }; - if (params.reason === "billing" || params.reason === "auth_permanent") { + if (params.reason === "billing") { const billingCount = failureCounts[params.reason] ?? 1; const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({ errorCount: billingCount, @@ -677,6 +687,23 @@ function computeNextProfileUsageStats(params: { recomputedUntil: params.now + backoffMs, }); updatedStats.disabledReason = params.reason; + } else if (params.reason === "auth_permanent") { + // auth_permanent errors can be caused by transient provider outages (e.g. + // GCP returning API_KEY_INVALID during an incident). Use a much shorter + // backoff than billing so the provider recovers automatically once the + // upstream issue resolves. + const authPermCount = failureCounts[params.reason] ?? 1; + const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({ + errorCount: authPermCount, + baseMs: params.cfgResolved.authPermanentBackoffMs, + maxMs: params.cfgResolved.authPermanentMaxMs, + }); + updatedStats.disabledUntil = keepActiveWindowOrRecompute({ + existingUntil: params.existing.disabledUntil, + now: params.now, + recomputedUntil: params.now + backoffMs, + }); + updatedStats.disabledReason = params.reason; } else { const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount); // Keep active cooldown windows immutable so retries within the window diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 58368c2d181..55eedc04b9e 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -822,6 +822,10 @@ export const FIELD_HELP: Record = { "auth.cooldowns.billingBackoffHoursByProvider": "Optional per-provider overrides for billing backoff (hours).", "auth.cooldowns.billingMaxHours": "Cap (hours) for billing backoff (default: 24).", + "auth.cooldowns.authPermanentBackoffMinutes": + "Base backoff (minutes) for auth_permanent failures (default: 10). Shorter than billing because these errors are often caused by transient provider outages.", + "auth.cooldowns.authPermanentMaxMinutes": + "Cap (minutes) for auth_permanent backoff (default: 60).", "auth.cooldowns.failureWindowHours": "Failure window (hours) for backoff counters (default: 24).", "auth.cooldowns.overloadedProfileRotations": "Maximum same-provider auth-profile rotations allowed for overloaded errors before switching to model fallback (default: 1).", diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index 6693b38ea97..99a8b9f2eb7 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -485,6 +485,8 @@ export const FIELD_LABELS: Record = { "auth.cooldowns.billingBackoffHours": "Billing Backoff (hours)", "auth.cooldowns.billingBackoffHoursByProvider": "Billing Backoff Overrides", "auth.cooldowns.billingMaxHours": "Billing Backoff Cap (hours)", + "auth.cooldowns.authPermanentBackoffMinutes": "Auth-Permanent Backoff (minutes)", + "auth.cooldowns.authPermanentMaxMinutes": "Auth-Permanent Backoff Cap (minutes)", "auth.cooldowns.failureWindowHours": "Failover Window (hours)", "auth.cooldowns.overloadedProfileRotations": "Overloaded Profile Rotations", "auth.cooldowns.overloadedBackoffMs": "Overloaded Backoff (ms)", diff --git a/src/config/types.auth.ts b/src/config/types.auth.ts index d02461ce0a8..bc00dc7d841 100644 --- a/src/config/types.auth.ts +++ b/src/config/types.auth.ts @@ -21,6 +21,16 @@ export type AuthConfig = { billingBackoffHoursByProvider?: Record; /** Billing backoff cap (hours). Default: 24. */ billingMaxHours?: number; + /** + * Base backoff for permanent-auth failures (minutes). These errors (e.g. + * API_KEY_INVALID) can be caused by transient provider outages, so the + * default is much shorter than billing backoff. Default: 10. + */ + authPermanentBackoffMinutes?: number; + /** + * Cap for permanent-auth backoff (minutes). Default: 60. + */ + authPermanentMaxMinutes?: number; /** * Failure window for backoff counters (hours). If no failures occur within * this window, counters reset. Default: 24.