fix: narrow auth permanent lockouts

This commit is contained in:
Peter Steinberger 2026-04-04 02:22:04 +09:00
parent 42e1d489fd
commit 865fa2ba72
13 changed files with 180 additions and 45 deletions

View File

@ -7816,6 +7816,38 @@
"help": "Cooldown/backoff controls for temporary profile suppression after billing-related failures and retry windows. Use these to prevent rapid re-selection of profiles that are still blocked.",
"hasChildren": true
},
{
"path": "auth.cooldowns.authPermanentBackoffMinutes",
"kind": "core",
"type": "number",
"required": false,
"deprecated": false,
"sensitive": false,
"tags": [
"access",
"auth",
"reliability"
],
"label": "Auth-Permanent Backoff (minutes)",
"help": "Base backoff (minutes) for high-confidence auth_permanent failures (default: 10). Keep this shorter than billing so providers recover automatically after transient upstream auth incidents.",
"hasChildren": false
},
{
"path": "auth.cooldowns.authPermanentMaxMinutes",
"kind": "core",
"type": "number",
"required": false,
"deprecated": false,
"sensitive": false,
"tags": [
"access",
"auth",
"performance"
],
"label": "Auth-Permanent Backoff Cap (minutes)",
"help": "Cap (minutes) for auth_permanent backoff (default: 60).",
"hasChildren": false
},
{
"path": "auth.cooldowns.billingBackoffHours",
"kind": "core",

View File

@ -7815,6 +7815,38 @@
"help": "Cooldown/backoff controls for temporary profile suppression after billing-related failures and retry windows. Use these to prevent rapid re-selection of profiles that are still blocked.",
"hasChildren": true
},
{
"path": "auth.cooldowns.authPermanentBackoffMinutes",
"kind": "core",
"type": "number",
"required": false,
"deprecated": false,
"sensitive": false,
"tags": [
"access",
"auth",
"reliability"
],
"label": "Auth-Permanent Backoff (minutes)",
"help": "Base backoff (minutes) for high-confidence auth_permanent failures (default: 10). Keep this shorter than billing so providers recover automatically after transient upstream auth incidents.",
"hasChildren": false
},
{
"path": "auth.cooldowns.authPermanentMaxMinutes",
"kind": "core",
"type": "number",
"required": false,
"deprecated": false,
"sensitive": false,
"tags": [
"access",
"auth",
"performance"
],
"label": "Auth-Permanent Backoff Cap (minutes)",
"help": "Cap (minutes) for auth_permanent backoff (default: 60).",
"hasChildren": false
},
{
"path": "auth.cooldowns.billingBackoffHours",
"kind": "core",

View File

@ -3037,6 +3037,8 @@ Notes:
billingBackoffHours: 5,
billingBackoffHoursByProvider: { anthropic: 3, openai: 8 },
billingMaxHours: 24,
authPermanentBackoffMinutes: 10,
authPermanentMaxMinutes: 60,
failureWindowHours: 24,
overloadedProfileRotations: 1,
overloadedBackoffMs: 0,
@ -3049,6 +3051,8 @@ Notes:
- `billingBackoffHours`: base backoff in hours when a profile fails due to billing/insufficient credits (default: `5`).
- `billingBackoffHoursByProvider`: optional per-provider overrides for billing backoff hours.
- `billingMaxHours`: cap in hours for billing backoff exponential growth (default: `24`).
- `authPermanentBackoffMinutes`: base backoff in minutes for high-confidence `auth_permanent` failures (default: `10`).
- `authPermanentMaxMinutes`: cap in minutes for `auth_permanent` backoff growth (default: `60`).
- `failureWindowHours`: rolling window in hours used for backoff counters (default: `24`).
- `overloadedProfileRotations`: maximum same-provider auth-profile rotations for overloaded errors before switching to model fallback (default: `1`).
- `overloadedBackoffMs`: fixed delay before retrying an overloaded provider/profile rotation (default: `0`).

View File

@ -199,8 +199,9 @@ describe("markAuthProfileFailure", () => {
expect(stats?.failureCounts?.overloaded).toBe(1);
});
});
it("disables auth_permanent failures via disabledUntil (like billing)", async () => {
it("disables auth_permanent failures for ~10 minutes by default", async () => {
await withAuthProfileStore(async ({ agentDir, store }) => {
const startedAt = Date.now();
await markAuthProfileFailure({
store,
profileId: "anthropic:default",
@ -213,6 +214,33 @@ describe("markAuthProfileFailure", () => {
expect(stats?.disabledReason).toBe("auth_permanent");
// Should NOT set cooldownUntil (that's for transient errors)
expect(stats?.cooldownUntil).toBeUndefined();
const remainingMs = (stats?.disabledUntil as number) - startedAt;
expectCooldownInRange(remainingMs, 9 * 60 * 1000, 11 * 60 * 1000);
});
});
it("honors auth_permanent backoff overrides", async () => {
await withAuthProfileStore(async ({ agentDir, store }) => {
const startedAt = Date.now();
await markAuthProfileFailure({
store,
profileId: "anthropic:default",
reason: "auth_permanent",
agentDir,
cfg: {
auth: {
cooldowns: {
authPermanentBackoffMinutes: 15,
authPermanentMaxMinutes: 45,
},
},
} as never,
});
const disabledUntil = store.usageStats?.["anthropic:default"]?.disabledUntil;
expect(typeof disabledUntil).toBe("number");
const remainingMs = (disabledUntil as number) - startedAt;
expectCooldownInRange(remainingMs, 14 * 60 * 1000, 16 * 60 * 1000);
});
});
it("resets backoff counters outside the failure window", async () => {

View File

@ -130,10 +130,7 @@ function applyWhamCooldownResult(params: {
: 0;
return {
...params.computed,
cooldownUntil: Math.max(
existingActiveCooldownUntil,
params.now + params.whamResult.cooldownMs,
),
cooldownUntil: Math.max(existingActiveCooldownUntil, params.now + params.whamResult.cooldownMs),
};
}
@ -528,11 +525,15 @@ function resolveAuthCooldownConfig(params: {
const defaults = {
billingBackoffHours: 5,
billingMaxHours: 24,
authPermanentBackoffMinutes: 10,
authPermanentMaxMinutes: 60,
failureWindowHours: 24,
} as const;
const resolveHours = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const resolveMinutes = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const cooldowns = params.cfg?.auth?.cooldowns;
const billingOverride = (() => {
@ -553,17 +554,19 @@ function resolveAuthCooldownConfig(params: {
defaults.billingBackoffHours,
);
const billingMaxHours = resolveHours(cooldowns?.billingMaxHours, defaults.billingMaxHours);
const authPermanentBackoffMinutes = resolveMinutes(
cooldowns?.authPermanentBackoffMinutes,
defaults.authPermanentBackoffMinutes,
);
const authPermanentMaxMinutes = resolveMinutes(
cooldowns?.authPermanentMaxMinutes,
defaults.authPermanentMaxMinutes,
);
const failureWindowHours = resolveHours(
cooldowns?.failureWindowHours,
defaults.failureWindowHours,
);
const resolveMinutes = (value: unknown, fallback: number) =>
typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;
const authPermanentBackoffMinutes = resolveMinutes(cooldowns?.authPermanentBackoffMinutes, 10);
const authPermanentMaxMinutes = resolveMinutes(cooldowns?.authPermanentMaxMinutes, 60);
return {
billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
billingMaxMs: billingMaxHours * 60 * 60 * 1000,
@ -688,13 +691,12 @@ function computeNextProfileUsageStats(params: {
});
updatedStats.disabledReason = params.reason;
} else if (params.reason === "auth_permanent") {
// auth_permanent errors can be caused by transient provider outages (e.g.
// GCP returning API_KEY_INVALID during an incident). Use a much shorter
// backoff than billing so the provider recovers automatically once the
// upstream issue resolves.
const authPermCount = failureCounts[params.reason] ?? 1;
// Keep permanent-auth failures in the disabled lane, but use a much
// shorter backoff than billing. Some upstream incidents surface auth-ish
// payloads transiently, so the provider should recover automatically.
const authPermanentCount = failureCounts[params.reason] ?? 1;
const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
errorCount: authPermCount,
errorCount: authPermanentCount,
baseMs: params.cfgResolved.authPermanentBackoffMs,
maxMs: params.cfgResolved.authPermanentMaxMs,
});

View File

@ -500,9 +500,9 @@ describe("failover-error", () => {
expect(resolveFailoverReasonFromError({ status: 403, message: "Forbidden" })).toBe("auth");
});
it("401 with permanent auth message returns auth_permanent", () => {
it("401 with ambiguous auth message returns auth", () => {
expect(resolveFailoverReasonFromError({ status: 401, message: "invalid_api_key" })).toBe(
"auth_permanent",
"auth",
);
});
@ -516,26 +516,22 @@ describe("failover-error", () => {
expect(resolveFailoverStatus("auth_permanent")).toBe(403);
});
it("coerces permanent auth error with correct reason", () => {
it("coerces ambiguous auth error into the short auth lane", () => {
const err = coerceToFailoverError(
{ status: 401, message: "invalid_api_key" },
{ provider: "anthropic", model: "claude-opus-4-6" },
);
expect(err?.reason).toBe("auth_permanent");
expect(err?.reason).toBe("auth");
expect(err?.provider).toBe("anthropic");
});
it("403 permission_error returns auth_permanent", () => {
expect(
resolveFailoverReasonFromError({
status: 403,
message:
"permission_error: OAuth authentication is currently not allowed for this organization.",
}),
).toBe("auth_permanent");
it("403 bare permission_error returns auth", () => {
expect(resolveFailoverReasonFromError({ status: 403, message: "permission_error" })).toBe(
"auth",
);
});
it("permission_error in error message string classifies as auth_permanent", () => {
it("permission_error with organization denial stays auth_permanent", () => {
const err = coerceToFailoverError(
"HTTP 403 permission_error: OAuth authentication is currently not allowed for this organization.",
{ provider: "anthropic", model: "claude-opus-4-6" },

View File

@ -528,8 +528,8 @@ describe("isTransientHttpError", () => {
});
describe("classifyFailoverReasonFromHttpStatus", () => {
it("treats HTTP 401 permanent auth failures as auth_permanent", () => {
expect(classifyFailoverReasonFromHttpStatus(401, "invalid_api_key")).toBe("auth_permanent");
it("treats HTTP 401 invalid_api_key as ambiguous auth", () => {
expect(classifyFailoverReasonFromHttpStatus(401, "invalid_api_key")).toBe("auth");
});
it("treats HTTP 422 as format error", () => {
@ -591,7 +591,7 @@ describe("classifyFailoverReasonFromHttpStatus", () => {
});
it("preserves explicit billing and auth signals on HTTP 410", () => {
expect(classifyFailoverReasonFromHttpStatus(410, "invalid_api_key")).toBe("auth_permanent");
expect(classifyFailoverReasonFromHttpStatus(410, "invalid_api_key")).toBe("auth");
expect(classifyFailoverReasonFromHttpStatus(410, "authentication failed")).toBe("auth");
expect(classifyFailoverReasonFromHttpStatus(410, "insufficient credits")).toBe("billing");
});
@ -613,7 +613,7 @@ describe("classifyFailoverReason", () => {
});
it("keeps explicit billing and auth signals on 410 text", () => {
expect(classifyFailoverReason("HTTP 410: invalid_api_key")).toBe("auth_permanent");
expect(classifyFailoverReason("HTTP 410: invalid_api_key")).toBe("auth");
expect(classifyFailoverReason("HTTP 410: authentication failed")).toBe("auth");
expect(classifyFailoverReason("HTTP 410: insufficient credits")).toBe("billing");
});
@ -935,11 +935,15 @@ describe("classifyFailoverReason", () => {
expect(classifyFailoverReason("LLM error: monthly limit reached")).toBe("rate_limit");
expect(classifyFailoverReason("LLM error: daily limit exceeded")).toBe("rate_limit");
});
it("classifies permanent auth errors as auth_permanent", () => {
expect(classifyFailoverReason("invalid_api_key")).toBe("auth_permanent");
it("keeps only high-confidence auth failures in auth_permanent", () => {
expect(classifyFailoverReason("invalid_api_key")).toBe("auth");
expect(classifyFailoverReason("permission_error")).toBe("auth");
expect(classifyFailoverReason("Your api key has been revoked")).toBe("auth_permanent");
expect(classifyFailoverReason("key has been disabled")).toBe("auth_permanent");
expect(classifyFailoverReason("account has been deactivated")).toBe("auth_permanent");
expect(
classifyFailoverReason("OAuth authentication is currently not allowed for this organization"),
).toBe("auth_permanent");
});
it("classifies JSON api_error with transient signal as timeout", () => {
expect(
@ -1013,6 +1017,11 @@ describe("classifyFailoverReason", () => {
classifyFailoverReason(
'{"type":"error","error":{"type":"api_error","message":"permission_error"}}',
),
).toBe("auth");
expect(
classifyFailoverReason(
'{"type":"error","error":{"type":"api_error","message":"permission_error: OAuth authentication is currently not allowed for this organization"}}',
),
).toBe("auth_permanent");
});
});

View File

@ -81,13 +81,10 @@ const ERROR_PATTERNS = {
/requires?\s+more\s+credits/i,
],
authPermanent: [
/api[_ ]?key[_ ]?(?:revoked|invalid|deactivated|deleted)/i,
"invalid_api_key",
/api[_ ]?key[_ ]?(?:revoked|deactivated|deleted)/i,
"key has been disabled",
"key has been revoked",
"account has been deactivated",
/could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i,
"permission_error",
"not allowed for this organization",
],
auth: [
@ -97,6 +94,8 @@ const ERROR_PATTERNS = {
"authentication",
"re-authenticate",
"oauth token refresh failed",
/could not (?:authenticate|validate).*(?:api[_ ]?key|credentials)/i,
"permission_error",
"unauthorized",
"forbidden",
"access denied",

View File

@ -52,6 +52,20 @@ describe("plugins.slots.contextEngine", () => {
});
});
describe("auth.cooldowns auth_permanent backoff config", () => {
it("accepts auth_permanent backoff knobs", () => {
const result = OpenClawSchema.safeParse({
auth: {
cooldowns: {
authPermanentBackoffMinutes: 10,
authPermanentMaxMinutes: 60,
},
},
});
expect(result.success).toBe(true);
});
});
describe("ui.seamColor", () => {
it("accepts hex colors", () => {
const res = validateConfigObject({ ui: { seamColor: "#FF4500" } });

View File

@ -782,6 +782,14 @@ export const GENERATED_BASE_CONFIG_SCHEMA = {
type: "number",
exclusiveMinimum: 0,
},
authPermanentBackoffMinutes: {
type: "number",
exclusiveMinimum: 0,
},
authPermanentMaxMinutes: {
type: "number",
exclusiveMinimum: 0,
},
failureWindowHours: {
type: "number",
exclusiveMinimum: 0,
@ -22567,6 +22575,16 @@ export const GENERATED_BASE_CONFIG_SCHEMA = {
help: "Cap (hours) for billing backoff (default: 24).",
tags: ["auth", "access", "performance"],
},
"auth.cooldowns.authPermanentBackoffMinutes": {
label: "Auth-Permanent Backoff (minutes)",
help: "Base backoff (minutes) for high-confidence auth_permanent failures (default: 10). Keep this shorter than billing so providers recover automatically after transient upstream auth incidents.",
tags: ["auth", "access", "reliability"],
},
"auth.cooldowns.authPermanentMaxMinutes": {
label: "Auth-Permanent Backoff Cap (minutes)",
help: "Cap (minutes) for auth_permanent backoff (default: 60).",
tags: ["auth", "access", "performance"],
},
"auth.cooldowns.failureWindowHours": {
label: "Failover Window (hours)",
help: "Failure window (hours) for backoff counters (default: 24).",

View File

@ -823,7 +823,7 @@ export const FIELD_HELP: Record<string, string> = {
"Optional per-provider overrides for billing backoff (hours).",
"auth.cooldowns.billingMaxHours": "Cap (hours) for billing backoff (default: 24).",
"auth.cooldowns.authPermanentBackoffMinutes":
"Base backoff (minutes) for auth_permanent failures (default: 10). Shorter than billing because these errors are often caused by transient provider outages.",
"Base backoff (minutes) for high-confidence auth_permanent failures (default: 10). Keep this shorter than billing so providers recover automatically after transient upstream auth incidents.",
"auth.cooldowns.authPermanentMaxMinutes":
"Cap (minutes) for auth_permanent backoff (default: 60).",
"auth.cooldowns.failureWindowHours": "Failure window (hours) for backoff counters (default: 24).",

View File

@ -22,13 +22,12 @@ export type AuthConfig = {
/** Billing backoff cap (hours). Default: 24. */
billingMaxHours?: number;
/**
* Base backoff for permanent-auth failures (minutes). These errors (e.g.
* API_KEY_INVALID) can be caused by transient provider outages, so the
* default is much shorter than billing backoff. Default: 10.
* Base backoff for high-confidence permanent-auth failures (minutes).
* Default: 10.
*/
authPermanentBackoffMinutes?: number;
/**
* Cap for permanent-auth backoff (minutes). Default: 60.
* Cap for high-confidence permanent-auth backoff (minutes). Default: 60.
*/
authPermanentMaxMinutes?: number;
/**

View File

@ -443,6 +443,8 @@ export const OpenClawSchema = z
billingBackoffHours: z.number().positive().optional(),
billingBackoffHoursByProvider: z.record(z.string(), z.number().positive()).optional(),
billingMaxHours: z.number().positive().optional(),
authPermanentBackoffMinutes: z.number().positive().optional(),
authPermanentMaxMinutes: z.number().positive().optional(),
failureWindowHours: z.number().positive().optional(),
overloadedProfileRotations: z.number().int().nonnegative().optional(),
overloadedBackoffMs: z.number().int().nonnegative().optional(),