// Mirror of https://github.com/openclaw/openclaw.git (606 lines, 19 KiB, TypeScript)
import type { OpenClawConfig } from "../../config/config.js";
|
|
import { normalizeProviderId } from "../model-selection.js";
|
|
import { logAuthProfileFailureStateChange } from "./state-observation.js";
|
|
import { saveAuthProfileStore, updateAuthProfileStoreWithLock } from "./store.js";
|
|
import type { AuthProfileFailureReason, AuthProfileStore, ProfileUsageStats } from "./types.js";
|
|
|
|
/**
 * Known failure reasons, listed from highest to lowest priority. The position
 * in this list is what `FAILURE_REASON_ORDER` records for each reason.
 */
const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [
  "auth_permanent",
  "auth",
  "billing",
  "format",
  "model_not_found",
  "overloaded",
  "timeout",
  "rate_limit",
  "unknown",
];

/** Fast membership test for the reasons listed in `FAILURE_REASON_PRIORITY`. */
const FAILURE_REASON_SET = new Set<AuthProfileFailureReason>(FAILURE_REASON_PRIORITY);

/** Reason → index in `FAILURE_REASON_PRIORITY` (a lower index means a higher priority). */
const FAILURE_REASON_ORDER = new Map<AuthProfileFailureReason, number>();
FAILURE_REASON_PRIORITY.forEach((reason, rank) => {
  FAILURE_REASON_ORDER.set(reason, rank);
});
|
|
|
|
/**
 * Whether cooldown tracking is skipped for the given provider.
 *
 * The provider id is normalized first (a missing provider is normalized as the
 * empty string); only "openrouter" and "kilocode" are bypassed.
 */
function isAuthCooldownBypassedForProvider(provider: string | undefined): boolean {
  switch (normalizeProviderId(provider ?? "")) {
    case "openrouter":
    case "kilocode":
      return true;
    default:
      return false;
  }
}
|
|
|
|
/**
 * Return the later of `cooldownUntil` and `disabledUntil`.
 *
 * Values that are not finite, positive numbers are ignored.
 *
 * @returns The latest valid timestamp, or `null` when neither field holds one.
 */
export function resolveProfileUnusableUntil(
  stats: Pick<ProfileUsageStats, "cooldownUntil" | "disabledUntil">,
): number | null {
  let latest: number | null = null;
  for (const candidate of [stats.cooldownUntil, stats.disabledUntil]) {
    if (typeof candidate !== "number" || !Number.isFinite(candidate) || candidate <= 0) {
      continue;
    }
    if (latest === null || candidate > latest) {
      latest = candidate;
    }
  }
  return latest;
}
|
|
|
|
/**
 * Whether a profile is currently unusable, i.e. the reference time is still
 * before the later of its `cooldownUntil` / `disabledUntil` timestamps.
 *
 * Profiles whose provider bypasses cooldown tracking, and profiles without
 * usage stats, are never reported as being in cooldown.
 *
 * @param now Reference time in ms; defaults to `Date.now()`.
 */
export function isProfileInCooldown(
  store: AuthProfileStore,
  profileId: string,
  now?: number,
): boolean {
  const provider = store.profiles[profileId]?.provider;
  if (isAuthCooldownBypassedForProvider(provider)) {
    return false;
  }

  const stats = store.usageStats?.[profileId];
  if (!stats) {
    return false;
  }

  const unusableUntil = resolveProfileUnusableUntil(stats);
  const ts = now ?? Date.now();
  if (!unusableUntil) {
    return false;
  }
  return ts < unusableUntil;
}
|
|
|
|
/**
 * Whether `until` is a finite, positive timestamp that lies strictly after `now`.
 */
function isActiveUnusableWindow(until: number | undefined, now: number): boolean {
  if (typeof until !== "number" || !Number.isFinite(until)) {
    return false;
  }
  return until > 0 && until > now;
}
|
|
|
|
/**
 * Infer the most likely reason the given profiles are currently unavailable.
 *
 * Scoring, per profile that has usage stats:
 * - an active `disabledUntil` window with a known `disabledReason` adds 1000
 *   to that reason and nothing else is considered for the profile;
 * - otherwise, an active `cooldownUntil` window adds each known, positive
 *   entry of `failureCounts` to its reason, or 1 to "unknown" when no entry
 *   qualifies.
 *
 * The reason with the highest total wins; equal totals resolve to the reason
 * listed earlier in `FAILURE_REASON_PRIORITY`.
 *
 * @param params.now Reference time in ms; defaults to `Date.now()`.
 * @returns The winning reason, or `null` when no profile contributed a score.
 */
export function resolveProfilesUnavailableReason(params: {
  store: AuthProfileStore;
  profileIds: string[];
  now?: number;
}): AuthProfileFailureReason | null {
  const { store, profileIds } = params;
  const now = params.now ?? Date.now();
  const tally = new Map<AuthProfileFailureReason, number>();

  // Accumulate only known reasons with a finite, strictly positive amount.
  const bump = (reason: AuthProfileFailureReason, amount: number) => {
    const accepted = FAILURE_REASON_SET.has(reason) && Number.isFinite(amount) && amount > 0;
    if (accepted) {
      tally.set(reason, (tally.get(reason) ?? 0) + amount);
    }
  };

  for (const profileId of profileIds) {
    const stats = store.usageStats?.[profileId];
    if (!stats) {
      continue;
    }

    // An explicit, still-active disable reason is high-signal: weight it heavily.
    const { disabledReason } = stats;
    if (
      isActiveUnusableWindow(stats.disabledUntil, now) &&
      disabledReason &&
      FAILURE_REASON_SET.has(disabledReason)
    ) {
      bump(disabledReason, 1_000);
      continue;
    }

    if (!isActiveUnusableWindow(stats.cooldownUntil, now)) {
      continue;
    }

    let sawClassifiedFailure = false;
    for (const [key, value] of Object.entries(stats.failureCounts ?? {})) {
      const reason = key as AuthProfileFailureReason;
      const count = typeof value === "number" ? value : 0;
      // `!(count <= 0)` rather than `count > 0`: a NaN count still marks the
      // window as classified even though `bump` will ignore it.
      if (FAILURE_REASON_SET.has(reason) && !(count <= 0)) {
        bump(reason, count);
        sawClassifiedFailure = true;
      }
    }

    if (!sawClassifiedFailure) {
      // Nothing classified for this cooldown window. This used to be reported
      // as "rate_limit", which produced misleading "rate limit reached"
      // warnings when the real cause was unknown (for example a transient
      // network blip or an unclassified server error).
      bump("unknown", 1);
    }
  }

  if (tally.size === 0) {
    return null;
  }

  let winner: AuthProfileFailureReason | null = null;
  let winnerScore = -1;
  let winnerRank = Number.MAX_SAFE_INTEGER;
  for (const candidate of FAILURE_REASON_PRIORITY) {
    const candidateScore = tally.get(candidate);
    if (candidateScore === undefined) {
      continue;
    }
    const candidateRank = FAILURE_REASON_ORDER.get(candidate) ?? Number.MAX_SAFE_INTEGER;
    const outranks =
      candidateScore > winnerScore ||
      (candidateScore === winnerScore && candidateRank < winnerRank);
    if (!outranks) {
      continue;
    }
    winner = candidate;
    winnerScore = candidateScore;
    winnerRank = candidateRank;
  }
  return winner;
}
|
|
|
|
/**
 * Find the earliest `unusableUntil` timestamp (ms epoch) across the given
 * profiles.
 *
 * Profiles without usage stats or without a valid timestamp are skipped. The
 * result is not compared against the current time, so it may already be in
 * the past.
 *
 * @returns The earliest timestamp, or `null` when no profile has one.
 */
export function getSoonestCooldownExpiry(
  store: AuthProfileStore,
  profileIds: string[],
): number | null {
  let earliest: number | null = null;
  for (const profileId of profileIds) {
    const stats = store.usageStats?.[profileId];
    const until = stats ? resolveProfileUnusableUntil(stats) : null;
    if (typeof until !== "number" || !Number.isFinite(until) || until <= 0) {
      continue;
    }
    earliest = earliest === null ? until : Math.min(earliest, until);
  }
  return earliest;
}
|
|
|
|
/**
 * Drop cooldown / disable windows that have already elapsed.
 *
 * `cooldownUntil` and `disabledUntil` are evaluated independently: whichever
 * has passed is cleared (`disabledReason` goes together with `disabledUntil`).
 * Once a profile has no remaining window, `errorCount` and `failureCounts` are
 * reset as well so the next transient failure starts from the shortest backoff
 * instead of escalating from stale counters. `lastFailureAt` is left untouched.
 *
 * Only the in-memory store is mutated; this function does not persist it.
 *
 * @param now Reference time in ms; defaults to `Date.now()`.
 * @returns `true` if any profile was modified.
 */
export function clearExpiredCooldowns(store: AuthProfileStore, now?: number): boolean {
  const usageStats = store.usageStats;
  if (!usageStats) {
    return false;
  }

  const ts = now ?? Date.now();
  const hasElapsed = (until: number | undefined): boolean =>
    typeof until === "number" && Number.isFinite(until) && until > 0 && ts >= until;

  let anyCleared = false;
  for (const [profileId, stats] of Object.entries(usageStats)) {
    if (!stats) {
      continue;
    }

    // Evaluate both windows before mutating anything.
    const cooldownElapsed = hasElapsed(stats.cooldownUntil);
    const disabledElapsed = hasElapsed(stats.disabledUntil);
    if (!cooldownElapsed && !disabledElapsed) {
      continue;
    }

    if (cooldownElapsed) {
      stats.cooldownUntil = undefined;
    }
    if (disabledElapsed) {
      stats.disabledUntil = undefined;
      stats.disabledReason = undefined;
    }

    // Counters are reset only when no window remains at all.
    if (!resolveProfileUnusableUntil(stats)) {
      stats.errorCount = 0;
      stats.failureCounts = undefined;
    }

    usageStats[profileId] = stats;
    anyCleared = true;
  }

  return anyCleared;
}
|
|
|
|
/**
 * Record a successful use of a profile: error state is reset and `lastUsed`
 * is set to the current time.
 *
 * The update is first attempted through `updateAuthProfileStoreWithLock`. When
 * that yields a store, its `usageStats` replace the caller's in-memory copy.
 * Otherwise the caller's store is updated directly and written with
 * `saveAuthProfileStore`, provided the profile exists there.
 */
export async function markAuthProfileUsed(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;

  // Same reset for the locked store and for the in-memory fallback.
  const stampUsed = (target: AuthProfileStore): void => {
    updateUsageStatsEntry(target, profileId, (existing) =>
      resetUsageStats(existing, { lastUsed: Date.now() }),
    );
  };

  const updated = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      if (!freshStore.profiles[profileId]) {
        return false;
      }
      stampUsed(freshStore);
      return true;
    },
  });

  if (updated) {
    store.usageStats = updated.usageStats;
    return;
  }

  if (store.profiles[profileId]) {
    stampUsed(store);
    saveAuthProfileStore(store, agentDir);
  }
}
|
|
|
|
/**
 * Compute the cooldown applied after a transient failure.
 *
 * The delay grows by a factor of five per error (1 min, 5 min, 25 min) and is
 * capped at one hour from the fourth error onwards.
 *
 * @param errorCount Number of errors so far. Anything that is not `>= 1`
 *   (zero, negative values, `NaN`) is treated as 1.
 * @returns Cooldown duration in milliseconds, between 60_000 and 3_600_000.
 */
export function calculateAuthProfileCooldownMs(errorCount: number): number {
  const minuteMs = 60 * 1000;
  const maxCooldownMs = 60 * minuteMs; // 1 hour max
  // `Math.max(1, NaN)` is NaN and would turn the cooldown deadline into NaN,
  // which every finite-timestamp check ignores. Clamp with a comparison so a
  // non-numeric count counts as a first failure instead.
  const normalized = errorCount >= 1 ? errorCount : 1;
  const exponent = Math.min(normalized - 1, 3);
  return Math.min(maxCooldownMs, minuteMs * 5 ** exponent);
}
|
|
|
|
/** Cooldown settings after defaults have been applied; all values are in milliseconds. */
interface ResolvedAuthCooldownConfig {
  /** Base disable duration passed to the billing backoff calculation. */
  billingBackoffMs: number;
  /** Upper bound passed to the billing backoff calculation. */
  billingMaxMs: number;
  /** Age of the last failure beyond which failure counters are restarted. */
  failureWindowMs: number;
}
|
|
|
|
/**
 * Resolve the cooldown settings under `cfg.auth.cooldowns` into milliseconds.
 *
 * Defaults are 5 h billing backoff, 24 h billing maximum and a 24 h failure
 * window. A configured value is used only when it is a finite, positive
 * number. For the billing backoff, the first `billingBackoffHoursByProvider`
 * entry whose normalized key equals `providerId` takes precedence over the
 * global `billingBackoffHours`.
 *
 * @param params.providerId Provider id, compared against normalized map keys.
 */
function resolveAuthCooldownConfig(params: {
  cfg?: OpenClawConfig;
  providerId: string;
}): ResolvedAuthCooldownConfig {
  const defaultBillingBackoffHours = 5;
  const defaultBillingMaxHours = 24;
  const defaultFailureWindowHours = 24;

  const positiveHoursOr = (candidate: unknown, fallback: number): number => {
    if (typeof candidate === "number" && Number.isFinite(candidate) && candidate > 0) {
      return candidate;
    }
    return fallback;
  };
  const hoursToMs = (hours: number): number => hours * 60 * 60 * 1000;

  const cooldowns = params.cfg?.auth?.cooldowns;
  const perProvider = cooldowns?.billingBackoffHoursByProvider;
  const providerOverride = perProvider
    ? Object.entries(perProvider).find(
        ([key]) => normalizeProviderId(key) === params.providerId,
      )?.[1]
    : undefined;

  return {
    billingBackoffMs: hoursToMs(
      positiveHoursOr(
        providerOverride ?? cooldowns?.billingBackoffHours,
        defaultBillingBackoffHours,
      ),
    ),
    billingMaxMs: hoursToMs(positiveHoursOr(cooldowns?.billingMaxHours, defaultBillingMaxHours)),
    failureWindowMs: hoursToMs(
      positiveHoursOr(cooldowns?.failureWindowHours, defaultFailureWindowHours),
    ),
  };
}
|
|
|
|
/**
 * Compute how long a profile stays disabled after a billing-style failure.
 *
 * The duration doubles with every error (at most ten doublings), starting
 * from `baseMs` raised to at least one minute, and never exceeds `maxMs`
 * (itself raised to at least the effective base).
 *
 * @param params.errorCount Number of failures; values below 1 count as 1.
 * @returns Disable duration in milliseconds.
 */
function calculateAuthProfileBillingDisableMsWithConfig(params: {
  errorCount: number;
  baseMs: number;
  maxMs: number;
}): number {
  const { errorCount, baseMs: requestedBaseMs, maxMs: requestedMaxMs } = params;
  const floorMs = Math.max(60_000, requestedBaseMs);
  const ceilingMs = Math.max(floorMs, requestedMaxMs);
  const doublings = Math.min(Math.max(1, errorCount) - 1, 10);
  return Math.min(ceilingMs, floorMs * Math.pow(2, doublings));
}
|
|
|
|
/**
 * Look up a profile's `unusableUntil` timestamp for a given store entry.
 *
 * @returns `null` when the profile's provider bypasses cooldown tracking or
 *   the profile has no usage stats; otherwise whatever
 *   `resolveProfileUnusableUntil` yields for its stats.
 */
export function resolveProfileUnusableUntilForDisplay(
  store: AuthProfileStore,
  profileId: string,
): number | null {
  const provider = store.profiles[profileId]?.provider;
  if (isAuthCooldownBypassedForProvider(provider)) {
    return null;
  }
  const stats = store.usageStats?.[profileId];
  return stats ? resolveProfileUnusableUntil(stats) : null;
}
|
|
|
|
/**
 * Build a copy of `existing` with all failure state cleared.
 *
 * `errorCount` becomes 0 and the cooldown / disable / failure-count fields
 * become `undefined`. Entries in `overrides` are applied last, so they win
 * over both the existing values and the cleared fields.
 */
function resetUsageStats(
  existing: ProfileUsageStats | undefined,
  overrides?: Partial<ProfileUsageStats>,
): ProfileUsageStats {
  const cleared = {
    errorCount: 0,
    cooldownUntil: undefined,
    disabledUntil: undefined,
    disabledReason: undefined,
    failureCounts: undefined,
  };
  return { ...existing, ...cleared, ...overrides };
}
|
|
|
|
/**
 * Replace the usage stats of one profile with the result of `updater`.
 *
 * `store.usageStats` is created when missing. `updater` receives the current
 * entry (or `undefined`) and its return value is stored under `profileId`.
 */
function updateUsageStatsEntry(
  store: AuthProfileStore,
  profileId: string,
  updater: (existing: ProfileUsageStats | undefined) => ProfileUsageStats,
): void {
  const table = store.usageStats ?? {};
  store.usageStats = table;
  table[profileId] = updater(table[profileId]);
}
|
|
|
|
/**
 * Choose the deadline for an unusable window.
 *
 * A finite `existingUntil` that lies strictly after `now` is kept unchanged;
 * in every other case `recomputedUntil` is returned.
 */
function keepActiveWindowOrRecompute(params: {
  existingUntil: number | undefined;
  now: number;
  recomputedUntil: number;
}): number {
  const current = params.existingUntil;
  if (typeof current === "number" && Number.isFinite(current) && current > params.now) {
    return current;
  }
  return params.recomputedUntil;
}
|
|
|
|
/**
 * Derive the usage stats a profile should have after one more failure.
 *
 * Counters restart from zero when the last failure is older than
 * `cfgResolved.failureWindowMs`, or when the previous unusable window has
 * already passed; otherwise they continue from `existing`. The failure is then
 * counted in `errorCount` and under `reason` in `failureCounts`, and
 * `lastFailureAt` is set to `now`.
 *
 * "billing" and "auth_permanent" set `disabledUntil` / `disabledReason` using
 * the billing backoff; every other reason sets `cooldownUntil` using the
 * regular cooldown. A window that is still active is never moved.
 *
 * @returns A new stats object; `existing` is not modified.
 */
function computeNextProfileUsageStats(params: {
  existing: ProfileUsageStats;
  now: number;
  reason: AuthProfileFailureReason;
  cfgResolved: ResolvedAuthCooldownConfig;
}): ProfileUsageStats {
  const { existing, now, reason, cfgResolved } = params;

  const lastFailureAt = existing.lastFailureAt;
  const failureWindowElapsed =
    typeof lastFailureAt === "number" &&
    lastFailureAt > 0 &&
    now - lastFailureAt > cfgResolved.failureWindowMs;

  // The stats read under the store lock may still carry counters from a window
  // that has already passed. Continuing from them would make this failure
  // escalate straight to a much longer cooldown (e.g. 1 min → 25 min), so an
  // elapsed window restarts the counters.
  const previousUntil = resolveProfileUnusableUntil(existing);
  const previousWindowElapsed = typeof previousUntil === "number" && now >= previousUntil;

  const startFresh = failureWindowElapsed || previousWindowElapsed;
  const errorCount = (startFresh ? 0 : (existing.errorCount ?? 0)) + 1;
  const failureCounts = startFresh ? {} : { ...existing.failureCounts };
  failureCounts[reason] = (failureCounts[reason] ?? 0) + 1;

  const next: ProfileUsageStats = {
    ...existing,
    errorCount,
    failureCounts,
    lastFailureAt: now,
  };

  if (reason !== "billing" && reason !== "auth_permanent") {
    // Transient failure: regular cooldown. An active cooldown stays as it is
    // so retries inside the window cannot push recovery further out.
    next.cooldownUntil = keepActiveWindowOrRecompute({
      existingUntil: existing.cooldownUntil,
      now,
      recomputedUntil: now + calculateAuthProfileCooldownMs(errorCount),
    });
    return next;
  }

  // Billing / permanent auth failure: longer disable window, scaled by how
  // often this particular reason has occurred.
  const disableMs = calculateAuthProfileBillingDisableMsWithConfig({
    errorCount: failureCounts[reason] ?? 1,
    baseMs: cfgResolved.billingBackoffMs,
    maxMs: cfgResolved.billingMaxMs,
  });
  // An active disable window stays as it is so retries inside the window
  // cannot extend recovery time indefinitely.
  next.disabledUntil = keepActiveWindowOrRecompute({
    existingUntil: existing.disabledUntil,
    now,
    recomputedUntil: now + disableMs,
  });
  next.disabledReason = reason;
  return next;
}
|
|
|
|
/**
 * Record a failure for a profile and log the resulting state change.
 *
 * "billing" and "auth_permanent" failures disable the profile (longer
 * backoff); every other reason applies the regular cooldown. Nothing happens
 * when the profile is missing from `store` or its provider bypasses cooldown
 * tracking.
 *
 * The update is first attempted through `updateAuthProfileStoreWithLock`. When
 * that yields a store, its `usageStats` replace the caller's in-memory copy.
 * Otherwise the caller's store is updated directly and written with
 * `saveAuthProfileStore`.
 */
export async function markAuthProfileFailure(params: {
  store: AuthProfileStore;
  profileId: string;
  reason: AuthProfileFailureReason;
  cfg?: OpenClawConfig;
  agentDir?: string;
  runId?: string;
}): Promise<void> {
  const { store, profileId, reason, agentDir, cfg, runId } = params;
  const profile = store.profiles[profileId];
  if (!profile || isAuthCooldownBypassedForProvider(profile.provider)) {
    return;
  }

  // Computes and stores the next stats for `target`; used by both the locked
  // path and the in-memory fallback.
  const applyFailure = (
    target: AuthProfileStore,
    provider: string,
    at: number,
  ): { previous: ProfileUsageStats | undefined; next: ProfileUsageStats } => {
    const cfgResolved = resolveAuthCooldownConfig({
      cfg,
      providerId: normalizeProviderId(provider),
    });
    const previous = target.usageStats?.[profileId];
    const next = computeNextProfileUsageStats({
      existing: previous ?? {},
      now: at,
      reason,
      cfgResolved,
    });
    updateUsageStatsEntry(target, profileId, () => next);
    return { previous, next };
  };

  let lockedOutcome:
    | { previous: ProfileUsageStats | undefined; next: ProfileUsageStats; at: number }
    | undefined;
  const updated = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      // Re-check against the store read under the lock; it may differ from
      // the caller's copy.
      const freshProfile = freshStore.profiles[profileId];
      if (!freshProfile || isAuthCooldownBypassedForProvider(freshProfile.provider)) {
        return false;
      }
      const at = Date.now();
      lockedOutcome = { ...applyFailure(freshStore, freshProfile.provider, at), at };
      return true;
    },
  });

  if (updated) {
    store.usageStats = updated.usageStats;
    if (lockedOutcome) {
      logAuthProfileFailureStateChange({
        runId,
        profileId,
        provider: profile.provider,
        reason,
        previous: lockedOutcome.previous,
        next: lockedOutcome.next,
        now: lockedOutcome.at,
      });
    }
    return;
  }

  if (!store.profiles[profileId]) {
    return;
  }

  // Fallback: the locked update produced no store, so update the caller's copy.
  const now = Date.now();
  const { previous, next } = applyFailure(
    store,
    store.profiles[profileId]?.provider ?? "",
    now,
  );
  saveAuthProfileStore(store, agentDir);
  logAuthProfileFailureStateChange({
    runId,
    profileId,
    provider: store.profiles[profileId]?.provider ?? profile.provider,
    reason,
    previous,
    next,
    now,
  });
}
|
|
|
|
/**
 * Put a profile into cooldown after a transient failure.
 *
 * Delegates to `markAuthProfileFailure` with reason "unknown", which applies
 * the exponential cooldown (1 min, 5 min, 25 min, capped at 1 hour). No `cfg`
 * is forwarded.
 */
export async function markAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
  runId?: string;
}): Promise<void> {
  const { store, profileId, agentDir, runId } = params;
  await markAuthProfileFailure({ store, profileId, reason: "unknown", agentDir, runId });
}
|
|
|
|
/**
 * Reset the failure state of a profile (e.g. for a manual reset).
 *
 * The reset is first attempted through `updateAuthProfileStoreWithLock`. When
 * that yields a store, its `usageStats` replace the caller's in-memory copy.
 * Otherwise the caller's store is reset directly and written with
 * `saveAuthProfileStore`. A store without usage stats for the profile is left
 * untouched on either path.
 */
export async function clearAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;

  // Same reset for the locked store and for the in-memory fallback.
  const resetEntry = (target: AuthProfileStore): boolean => {
    if (!target.usageStats?.[profileId]) {
      return false;
    }
    updateUsageStatsEntry(target, profileId, (existing) => resetUsageStats(existing));
    return true;
  };

  const updated = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => resetEntry(freshStore),
  });

  if (updated) {
    store.usageStats = updated.usageStats;
    return;
  }

  if (resetEntry(store)) {
    saveAuthProfileStore(store, agentDir);
  }
}
|