fix(auth): use shorter backoff for auth_permanent failures

auth_permanent errors (e.g. API_KEY_INVALID) can be caused by transient
provider outages rather than genuinely revoked credentials. Previously
these shared the billing backoff (5h base, 24h cap), which left providers
disabled long after the upstream issue resolved.

Introduce separate auth.cooldowns.authPermanentBackoffMinutes (default: 10)
and auth.cooldowns.authPermanentMaxMinutes (default: 60) config options so
auth_permanent failures recover in minutes rather than hours.

Fixes #56838
This commit is contained in:
Extra Small 2026-04-03 09:17:34 -07:00 committed by Peter Steinberger
parent 022a24ec48
commit 42e1d489fd
5 changed files with 47 additions and 4 deletions

View File

@ -653,7 +653,7 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
label: "disabledUntil(auth_permanent)",
reason: "auth_permanent" as const,
buildUsageStats: (now: number): WindowStats => ({
disabledUntil: now + 20 * 60 * 60 * 1000,
disabledUntil: now + 50 * 60 * 1000,
disabledReason: "auth_permanent",
errorCount: 5,
failureCounts: { auth_permanent: 5 },
@ -724,8 +724,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
lastFailureAt: now - 60_000,
}),
// errorCount resets, auth_permanent count resets to 1 →
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
// calculateAuthProfileBillingDisableMsWithConfig(1, 10m, 60m) = 10m
expectedUntil: (now: number) => now + 10 * 60 * 1000,
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
},
];

View File

@ -516,6 +516,8 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
/**
 * Auth cooldown settings with every duration expressed in milliseconds.
 */
type ResolvedAuthCooldownConfig = {
/** Base backoff for billing failures, in ms. */
billingBackoffMs: number;
/** Cap for billing backoff, in ms. */
billingMaxMs: number;
/** Base backoff for auth_permanent failures, in ms. */
authPermanentBackoffMs: number;
/** Cap for auth_permanent backoff, in ms. */
authPermanentMaxMs: number;
/** Failure window for backoff counters, in ms. */
failureWindowMs: number;
};
@ -556,9 +558,17 @@ function resolveAuthCooldownConfig(params: {
defaults.failureWindowHours,
);
const resolveMinutes = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const authPermanentBackoffMinutes = resolveMinutes(cooldowns?.authPermanentBackoffMinutes, 10);
const authPermanentMaxMinutes = resolveMinutes(cooldowns?.authPermanentMaxMinutes, 60);
return {
billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
billingMaxMs: billingMaxHours * 60 * 60 * 1000,
authPermanentBackoffMs: authPermanentBackoffMinutes * 60 * 1000,
authPermanentMaxMs: authPermanentMaxMinutes * 60 * 1000,
failureWindowMs: failureWindowHours * 60 * 60 * 1000,
};
}
@ -662,7 +672,7 @@ function computeNextProfileUsageStats(params: {
lastFailureAt: params.now,
};
if (params.reason === "billing" || params.reason === "auth_permanent") {
if (params.reason === "billing") {
const billingCount = failureCounts[params.reason] ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: billingCount,
@ -677,6 +687,23 @@ function computeNextProfileUsageStats(params: {
recomputedUntil: params.now + backoffMs,
});
updatedStats.disabledReason = params.reason;
} else if (params.reason === "auth_permanent") {
// auth_permanent errors can be caused by transient provider outages (e.g.
// GCP returning API_KEY_INVALID during an incident). Use a much shorter
// backoff than billing so the provider recovers automatically once the
// upstream issue resolves.
const authPermCount = failureCounts[params.reason] ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: authPermCount,
baseMs: params.cfgResolved.authPermanentBackoffMs,
maxMs: params.cfgResolved.authPermanentMaxMs,
});
updatedStats.disabledUntil = keepActiveWindowOrRecompute({
existingUntil: params.existing.disabledUntil,
now: params.now,
recomputedUntil: params.now + backoffMs,
});
updatedStats.disabledReason = params.reason;
} else {
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
// Keep active cooldown windows immutable so retries within the window

View File

@ -822,6 +822,10 @@ export const FIELD_HELP: Record<string, string> = {
"auth.cooldowns.billingBackoffHoursByProvider":
"Optional per-provider overrides for billing backoff (hours).",
"auth.cooldowns.billingMaxHours": "Cap (hours) for billing backoff (default: 24).",
"auth.cooldowns.authPermanentBackoffMinutes":
"Base backoff (minutes) for auth_permanent failures (default: 10). Shorter than billing because these errors are often caused by transient provider outages.",
"auth.cooldowns.authPermanentMaxMinutes":
"Cap (minutes) for auth_permanent backoff (default: 60).",
"auth.cooldowns.failureWindowHours": "Failure window (hours) for backoff counters (default: 24).",
"auth.cooldowns.overloadedProfileRotations":
"Maximum same-provider auth-profile rotations allowed for overloaded errors before switching to model fallback (default: 1).",

View File

@ -485,6 +485,8 @@ export const FIELD_LABELS: Record<string, string> = {
"auth.cooldowns.billingBackoffHours": "Billing Backoff (hours)",
"auth.cooldowns.billingBackoffHoursByProvider": "Billing Backoff Overrides",
"auth.cooldowns.billingMaxHours": "Billing Backoff Cap (hours)",
"auth.cooldowns.authPermanentBackoffMinutes": "Auth-Permanent Backoff (minutes)",
"auth.cooldowns.authPermanentMaxMinutes": "Auth-Permanent Backoff Cap (minutes)",
"auth.cooldowns.failureWindowHours": "Failover Window (hours)",
"auth.cooldowns.overloadedProfileRotations": "Overloaded Profile Rotations",
"auth.cooldowns.overloadedBackoffMs": "Overloaded Backoff (ms)",

View File

@ -21,6 +21,16 @@ export type AuthConfig = {
billingBackoffHoursByProvider?: Record<string, number>;
/** Billing backoff cap (hours). Default: 24. */
billingMaxHours?: number;
/**
* Base backoff for permanent-auth failures (minutes). These errors (e.g.
* API_KEY_INVALID) can be caused by transient provider outages, so the
* default is much shorter than billing backoff. Default: 10.
*/
authPermanentBackoffMinutes?: number;
/**
* Cap for permanent-auth backoff (minutes). Default: 60.
*/
authPermanentMaxMinutes?: number;
/**
* Failure window for backoff counters (hours). If no failures occur within
* this window, counters reset. Default: 24.