mirror of https://github.com/openclaw/openclaw.git
fix(auth): use shorter backoff for auth_permanent failures
auth_permanent errors (e.g. API_KEY_INVALID) can be caused by transient provider outages rather than genuinely revoked credentials. Previously these failures used the same 5h–24h backoff as billing failures, which left providers disabled long after the upstream issue was resolved. Introduce separate authPermanentBackoffMinutes (default: 10) and authPermanentMaxMinutes (default: 60) config options so that auth_permanent failures recover in minutes rather than hours. Fixes #56838.
This commit is contained in:
parent
022a24ec48
commit
42e1d489fd
|
|
@ -653,7 +653,7 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
|||
label: "disabledUntil(auth_permanent)",
|
||||
reason: "auth_permanent" as const,
|
||||
buildUsageStats: (now: number): WindowStats => ({
|
||||
disabledUntil: now + 20 * 60 * 60 * 1000,
|
||||
disabledUntil: now + 50 * 60 * 1000,
|
||||
disabledReason: "auth_permanent",
|
||||
errorCount: 5,
|
||||
failureCounts: { auth_permanent: 5 },
|
||||
|
|
@ -724,8 +724,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
|
|||
lastFailureAt: now - 60_000,
|
||||
}),
|
||||
// errorCount resets, auth_permanent count resets to 1 →
|
||||
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
|
||||
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
|
||||
// calculateAuthProfileBillingDisableMsWithConfig(1, 10m, 60m) = 10m
|
||||
expectedUntil: (now: number) => now + 10 * 60 * 1000,
|
||||
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
|
||||
},
|
||||
];
|
||||
|
|
|
|||
|
|
@ -516,6 +516,8 @@ export function calculateAuthProfileCooldownMs(errorCount: number): number {
|
|||
/**
 * Cooldown settings after resolution, with every duration expressed in
 * milliseconds so callers never have to convert units.
 */
type ResolvedAuthCooldownConfig = {
|
||||
/** Base backoff for billing failures, converted from hours to milliseconds. */
billingBackoffMs: number;
|
||||
/** Upper bound for billing backoff, converted from hours to milliseconds. */
billingMaxMs: number;
|
||||
/** Base backoff for auth_permanent failures, converted from minutes to milliseconds. */
authPermanentBackoffMs: number;
|
||||
/** Upper bound for auth_permanent backoff, converted from minutes to milliseconds. */
authPermanentMaxMs: number;
|
||||
/** Failure window, converted from hours to milliseconds. */
failureWindowMs: number;
|
||||
};
|
||||
|
||||
|
|
@ -556,9 +558,17 @@ function resolveAuthCooldownConfig(params: {
|
|||
defaults.failureWindowHours,
|
||||
);
|
||||
|
||||
// Returns `value` only when it is a finite number strictly greater than zero;
// any other input (non-number, NaN, ±Infinity, zero, negative) yields `fallback`.
const resolveMinutes = (value: unknown, fallback: number) =>
|
||||
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
|
||||
|
||||
const authPermanentBackoffMinutes = resolveMinutes(cooldowns?.authPermanentBackoffMinutes, 10);
|
||||
const authPermanentMaxMinutes = resolveMinutes(cooldowns?.authPermanentMaxMinutes, 60);
|
||||
|
||||
return {
|
||||
billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
|
||||
billingMaxMs: billingMaxHours * 60 * 60 * 1000,
|
||||
authPermanentBackoffMs: authPermanentBackoffMinutes * 60 * 1000,
|
||||
authPermanentMaxMs: authPermanentMaxMinutes * 60 * 1000,
|
||||
failureWindowMs: failureWindowHours * 60 * 60 * 1000,
|
||||
};
|
||||
}
|
||||
|
|
@ -662,7 +672,7 @@ function computeNextProfileUsageStats(params: {
|
|||
lastFailureAt: params.now,
|
||||
};
|
||||
|
||||
if (params.reason === "billing" || params.reason === "auth_permanent") {
|
||||
if (params.reason === "billing") {
|
||||
const billingCount = failureCounts[params.reason] ?? 1;
|
||||
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
|
||||
errorCount: billingCount,
|
||||
|
|
@ -677,6 +687,23 @@ function computeNextProfileUsageStats(params: {
|
|||
recomputedUntil: params.now + backoffMs,
|
||||
});
|
||||
updatedStats.disabledReason = params.reason;
|
||||
} else if (params.reason === "auth_permanent") {
|
||||
// auth_permanent errors can be caused by transient provider outages (e.g.
|
||||
// GCP returning API_KEY_INVALID during an incident). Use a much shorter
|
||||
// backoff than billing so the provider recovers automatically once the
|
||||
// upstream issue resolves.
|
||||
const authPermCount = failureCounts[params.reason] ?? 1;
|
||||
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
|
||||
errorCount: authPermCount,
|
||||
baseMs: params.cfgResolved.authPermanentBackoffMs,
|
||||
maxMs: params.cfgResolved.authPermanentMaxMs,
|
||||
});
|
||||
updatedStats.disabledUntil = keepActiveWindowOrRecompute({
|
||||
existingUntil: params.existing.disabledUntil,
|
||||
now: params.now,
|
||||
recomputedUntil: params.now + backoffMs,
|
||||
});
|
||||
updatedStats.disabledReason = params.reason;
|
||||
} else {
|
||||
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
|
||||
// Keep active cooldown windows immutable so retries within the window
|
||||
|
|
|
|||
|
|
@ -822,6 +822,10 @@ export const FIELD_HELP: Record<string, string> = {
|
|||
"auth.cooldowns.billingBackoffHoursByProvider":
|
||||
"Optional per-provider overrides for billing backoff (hours).",
|
||||
"auth.cooldowns.billingMaxHours": "Cap (hours) for billing backoff (default: 24).",
|
||||
"auth.cooldowns.authPermanentBackoffMinutes":
|
||||
"Base backoff (minutes) for auth_permanent failures (default: 10). Shorter than billing because these errors are often caused by transient provider outages.",
|
||||
"auth.cooldowns.authPermanentMaxMinutes":
|
||||
"Cap (minutes) for auth_permanent backoff (default: 60).",
|
||||
"auth.cooldowns.failureWindowHours": "Failure window (hours) for backoff counters (default: 24).",
|
||||
"auth.cooldowns.overloadedProfileRotations":
|
||||
"Maximum same-provider auth-profile rotations allowed for overloaded errors before switching to model fallback (default: 1).",
|
||||
|
|
|
|||
|
|
@ -485,6 +485,8 @@ export const FIELD_LABELS: Record<string, string> = {
|
|||
"auth.cooldowns.billingBackoffHours": "Billing Backoff (hours)",
|
||||
"auth.cooldowns.billingBackoffHoursByProvider": "Billing Backoff Overrides",
|
||||
"auth.cooldowns.billingMaxHours": "Billing Backoff Cap (hours)",
|
||||
"auth.cooldowns.authPermanentBackoffMinutes": "Auth-Permanent Backoff (minutes)",
|
||||
"auth.cooldowns.authPermanentMaxMinutes": "Auth-Permanent Backoff Cap (minutes)",
|
||||
"auth.cooldowns.failureWindowHours": "Failover Window (hours)",
|
||||
"auth.cooldowns.overloadedProfileRotations": "Overloaded Profile Rotations",
|
||||
"auth.cooldowns.overloadedBackoffMs": "Overloaded Backoff (ms)",
|
||||
|
|
|
|||
|
|
@ -21,6 +21,16 @@ export type AuthConfig = {
|
|||
billingBackoffHoursByProvider?: Record<string, number>;
|
||||
/** Billing backoff cap (hours). Default: 24. */
|
||||
billingMaxHours?: number;
|
||||
/**
|
||||
* Base backoff for permanent-auth failures (minutes). These errors (e.g.
|
||||
* API_KEY_INVALID) can be caused by transient provider outages, so the
|
||||
* default is much shorter than billing backoff. Default: 10.
|
||||
*/
|
||||
authPermanentBackoffMinutes?: number;
|
||||
/**
|
||||
* Cap for permanent-auth backoff (minutes). Default: 60.
|
||||
*/
|
||||
authPermanentMaxMinutes?: number;
|
||||
/**
|
||||
* Failure window for backoff counters (hours). If no failures occur within
|
||||
* this window, counters reset. Default: 24.
|
||||
|
|
|
|||
Loading…
Reference in New Issue