mirror of https://github.com/openclaw/openclaw.git
ci: gate releases on live cache floors
This commit is contained in:
parent
be4eb269fc
commit
6e6b4f6004
|
|
@ -129,6 +129,31 @@ jobs:
|
|||
- name: Verify release contents
|
||||
run: pnpm release:check
|
||||
|
||||
- name: Validate live cache credentials
|
||||
if: ${{ github.ref == 'refs/heads/main' }}
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [[ -z "${OPENAI_API_KEY}" ]]; then
|
||||
echo "Missing OPENAI_API_KEY secret for release live cache validation." >&2
|
||||
exit 1
|
||||
fi
|
||||
if [[ -z "${ANTHROPIC_API_KEY}" ]]; then
|
||||
echo "Missing ANTHROPIC_API_KEY secret for release live cache validation." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Verify live prompt cache floors
|
||||
if: ${{ github.ref == 'refs/heads/main' }}
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
OPENCLAW_LIVE_CACHE_TEST: "1"
|
||||
OPENCLAW_LIVE_TEST: "1"
|
||||
run: pnpm test:live:cache
|
||||
|
||||
- name: Pack prepared npm tarball
|
||||
id: packed_tarball
|
||||
env:
|
||||
|
|
|
|||
|
|
@ -41,6 +41,10 @@ OpenClaw has three public release lanes:
|
|||
`dist/*` release artifacts and Control UI bundle exist for the pack
|
||||
validation step
|
||||
- Run `pnpm release:check` before every tagged release
|
||||
- Main-branch npm preflight also runs
|
||||
`OPENCLAW_LIVE_TEST=1 OPENCLAW_LIVE_CACHE_TEST=1 pnpm test:live:cache`
|
||||
before packaging the tarball, using both `OPENAI_API_KEY` and
|
||||
`ANTHROPIC_API_KEY` workflow secrets
|
||||
- Run `RELEASE_TAG=vYYYY.M.D node --import tsx scripts/openclaw-npm-release-check.ts`
|
||||
(or the matching beta/correction tag) before approval
|
||||
- After npm publish, run
|
||||
|
|
|
|||
|
|
@ -157,10 +157,19 @@ OpenClaw exposes dedicated cache-trace diagnostics for embedded agent runs.
|
|||
|
||||
## Live regression tests
|
||||
|
||||
OpenClaw keeps provider-specific live cache probes for repeated prefixes, tool turns, image turns, and MCP-style tool transcripts.
|
||||
OpenClaw keeps one combined live cache regression gate for repeated prefixes, tool turns, image turns, MCP-style tool transcripts, and an Anthropic no-cache control.
|
||||
|
||||
- `src/agents/pi-embedded-runner.cache.live.test.ts`
|
||||
- `src/agents/pi-mcp-style.cache.live.test.ts`
|
||||
- `src/agents/live-cache-regression.live.test.ts`
|
||||
- `src/agents/live-cache-regression-baseline.ts`
|
||||
|
||||
Run the narrow live gate with:
|
||||
|
||||
```sh
|
||||
OPENCLAW_LIVE_TEST=1 OPENCLAW_LIVE_CACHE_TEST=1 pnpm test:live:cache
|
||||
```
|
||||
|
||||
The baseline file stores the most recent observed live numbers plus the provider-specific regression floors used by the test.
|
||||
The runner also uses fresh per-run session IDs and prompt namespaces so previous cache state does not pollute the current regression sample.
|
||||
|
||||
These tests intentionally do not use identical success criteria across providers.
|
||||
|
||||
|
|
@ -180,12 +189,14 @@ These tests intentionally do not use identical success criteria across providers
|
|||
- image transcript: `cacheRead >= 3840`, hit rate `>= 0.82`
|
||||
- MCP-style transcript: `cacheRead >= 4096`, hit rate `>= 0.85`
|
||||
|
||||
Fresh OpenAI verification on 2026-04-04 landed at:
|
||||
Fresh combined live verification on 2026-04-04 landed at:
|
||||
|
||||
- stable prefix: `cacheRead=4864`, hit rate `0.971`
|
||||
- tool transcript: `cacheRead=4608`, hit rate `0.900`
|
||||
- image transcript: `cacheRead=4864`, hit rate `0.959`
|
||||
- MCP-style transcript: `cacheRead=4608`, hit rate `0.895`
|
||||
- stable prefix: `cacheRead=4864`, hit rate `0.966`
|
||||
- tool transcript: `cacheRead=4608`, hit rate `0.896`
|
||||
- image transcript: `cacheRead=4864`, hit rate `0.954`
|
||||
- MCP-style transcript: `cacheRead=4608`, hit rate `0.891`
|
||||
|
||||
Recent local wall-clock time for the combined gate was about `88s`.
|
||||
|
||||
Why the assertions differ:
|
||||
|
||||
|
|
|
|||
|
|
@ -1149,6 +1149,7 @@
|
|||
"test:install:e2e:openai": "OPENCLAW_E2E_MODELS=openai bash scripts/test-install-sh-e2e-docker.sh",
|
||||
"test:install:smoke": "bash scripts/test-install-sh-docker.sh",
|
||||
"test:live": "node scripts/test-live.mjs",
|
||||
"test:live:cache": "bun scripts/check-live-cache.ts",
|
||||
"test:live:gateway-profiles": "node scripts/test-live.mjs -- src/gateway/gateway-models.profiles.live.test.ts",
|
||||
"test:live:models-profiles": "node scripts/test-live.mjs -- src/agents/models.profiles.live.test.ts",
|
||||
"test:max": "OPENCLAW_VITEST_MAX_WORKERS=8 vitest run --config vitest.config.ts",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,18 @@
|
|||
import { runLiveCacheRegression } from "../src/agents/live-cache-regression-runner.js";
|
||||
import { LIVE_CACHE_TEST_ENABLED, logLiveCache } from "../src/agents/live-cache-test-support.js";
|
||||
|
||||
if (!LIVE_CACHE_TEST_ENABLED) {
|
||||
logLiveCache("skipped; set OPENCLAW_LIVE_TEST=1 and OPENCLAW_LIVE_CACHE_TEST=1");
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const result = await runLiveCacheRegression();
|
||||
if (result.regressions.length > 0) {
|
||||
process.stderr.write("\n[live-cache] regressions detected:\n");
|
||||
for (const regression of result.regressions) {
|
||||
process.stderr.write(`- ${regression}\n`);
|
||||
}
|
||||
process.exitCode = 1;
|
||||
} else {
|
||||
process.stderr.write("\n[live-cache] all regression floors satisfied\n");
|
||||
}
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
// Baseline floors for the combined live cache regression gate.
// `observed*` fields record the most recently measured live values
// (informational only — never asserted); `min*`/`max*` fields are the
// bounds the regression runner asserts against.
export type LiveCacheFloor = {
  // Last observed live cacheRead token count (not asserted).
  observedCacheRead?: number;
  // Last observed live cacheWrite token count (not asserted).
  observedCacheWrite?: number;
  // Last observed cache hit rate in [0, 1] (not asserted).
  observedHitRate?: number;
  // Asserted lower bound on the best run's cacheRead.
  minCacheRead?: number;
  // Asserted lower bound on the warmup run's cacheWrite.
  minCacheWrite?: number;
  // Asserted lower bound on the best run's hit rate.
  minHitRate?: number;
  // Asserted upper bound on cacheRead (used by cache-disabled control lanes).
  maxCacheRead?: number;
  // Asserted upper bound on cacheWrite (used by cache-disabled control lanes).
  maxCacheWrite?: number;
};

// Provider -> lane -> floor table consumed by the live regression runner.
// Floors intentionally differ per provider (different cache accounting).
export const LIVE_CACHE_REGRESSION_BASELINE = {
  anthropic: {
    // No-cache control: with cacheRetention "none", reads/writes must stay near zero.
    disabled: {
      observedCacheRead: 0,
      observedCacheWrite: 0,
      maxCacheRead: 32,
      maxCacheWrite: 32,
    },
    // Repeated image-turn transcript lane.
    image: {
      observedCacheRead: 5_660,
      observedCacheWrite: 85,
      observedHitRate: 0.985,
      minCacheRead: 4_500,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
    // MCP-style tool transcript lane.
    mcp: {
      observedCacheRead: 6_240,
      observedCacheWrite: 113,
      observedHitRate: 0.982,
      minCacheRead: 5_800,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
    // Stable repeated-prefix lane.
    stable: {
      observedCacheRead: 5_660,
      observedCacheWrite: 18,
      observedHitRate: 0.996,
      minCacheRead: 5_400,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
    // Plain tool-turn transcript lane.
    tool: {
      observedCacheRead: 6_223,
      observedCacheWrite: 97,
      observedHitRate: 0.984,
      minCacheRead: 5_000,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
  },
  openai: {
    // OpenAI exposes no explicit cacheWrite counter, so only read/hit-rate floors.
    image: {
      observedCacheRead: 4_864,
      observedHitRate: 0.954,
      minCacheRead: 3_840,
      minHitRate: 0.82,
    },
    mcp: {
      observedCacheRead: 4_608,
      observedHitRate: 0.891,
      minCacheRead: 4_096,
      minHitRate: 0.85,
    },
    stable: {
      observedCacheRead: 4_864,
      observedHitRate: 0.966,
      minCacheRead: 4_608,
      minHitRate: 0.9,
    },
    tool: {
      observedCacheRead: 4_608,
      observedHitRate: 0.896,
      minCacheRead: 4_096,
      minHitRate: 0.85,
    },
  },
} as const satisfies Record<string, Record<string, LiveCacheFloor>>;
|
||||
|
|
@ -0,0 +1,472 @@
|
|||
import fs from "node:fs/promises";
|
||||
import type { AssistantMessage, Message, Tool } from "@mariozechner/pi-ai";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import { LIVE_CACHE_REGRESSION_BASELINE } from "./live-cache-regression-baseline.js";
|
||||
import {
|
||||
buildAssistantHistoryTurn,
|
||||
buildStableCachePrefix,
|
||||
completeSimpleWithLiveTimeout,
|
||||
computeCacheHitRate,
|
||||
extractAssistantText,
|
||||
logLiveCache,
|
||||
resolveLiveDirectModel,
|
||||
} from "./live-cache-test-support.js";
|
||||
|
||||
// Per-request timeout for a single live completion, per provider.
const OPENAI_TIMEOUT_MS = 120_000;
const ANTHROPIC_TIMEOUT_MS = 120_000;
// Long, fixed system-prompt prefixes that providers can cache across runs.
const OPENAI_PREFIX = buildStableCachePrefix("openai");
const OPENAI_MCP_PREFIX = buildStableCachePrefix("openai-mcp-style");
const ANTHROPIC_PREFIX = buildStableCachePrefix("anthropic");
// Small real PNG used by the image-turn lane, resolved relative to this module.
const LIVE_TEST_PNG_URL = new URL(
  "../../apps/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png",
  import.meta.url,
);

type LiveResolvedModel = Awaited<ReturnType<typeof resolveLiveDirectModel>>;
type ProviderKey = keyof typeof LIVE_CACHE_REGRESSION_BASELINE;
type CacheLane = "image" | "mcp" | "stable" | "tool";
// One probe completion: the assistant text, raw usage counters, and derived hit rate.
type CacheRun = {
  hitRate: number;
  suffix: string;
  text: string;
  usage: AssistantMessage["usage"];
};
// Per-lane outcome: `warmup` populates the cache, `best` is the better of the
// two follow-up hit runs, `disabled` is set only by the no-cache control lane.
type LaneResult = {
  best?: CacheRun;
  disabled?: CacheRun;
  warmup?: CacheRun;
};

export type LiveCacheRegressionResult = {
  // Human-readable floor violations; empty means the gate passed.
  regressions: string[];
  // Provider -> lane -> observed usage snapshot, for logging/debugging.
  summary: Record<string, Record<string, unknown>>;
};

// Minimal tool used by the plain tool-turn lane.
const NOOP_TOOL: Tool = {
  name: "noop",
  description: "Return ok.",
  parameters: Type.Object({}, { additionalProperties: false }),
};

// MCP-style namespaced tool used by the MCP transcript lane.
const MCP_TOOL: Tool = {
  name: "bundleProbe__bundle_probe",
  description: "Return bundle MCP probe text.",
  parameters: Type.Object({}, { additionalProperties: false }),
};
|
||||
|
||||
function makeUserTurn(content: Extract<Message, { role: "user" }>["content"]): Message {
|
||||
return {
|
||||
role: "user",
|
||||
content,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
}
|
||||
|
||||
function makeImageUserTurn(text: string, pngBase64: string): Message {
|
||||
return makeUserTurn([
|
||||
{ type: "text", text },
|
||||
{ type: "image", mimeType: "image/png", data: pngBase64 },
|
||||
]);
|
||||
}
|
||||
|
||||
function makeToolResultMessage(
|
||||
toolCallId: string,
|
||||
toolName: string,
|
||||
text: string,
|
||||
): Extract<Message, { role: "toolResult" }> {
|
||||
return {
|
||||
role: "toolResult",
|
||||
toolCallId,
|
||||
toolName,
|
||||
content: [{ type: "text", text }],
|
||||
isError: false,
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
}
|
||||
|
||||
function extractFirstToolCall(message: AssistantMessage) {
|
||||
return message.content.find((block) => block.type === "toolCall");
|
||||
}
|
||||
|
||||
function assert(condition: unknown, message: string): asserts condition {
|
||||
if (!condition) {
|
||||
throw new Error(message);
|
||||
}
|
||||
}
|
||||
|
||||
async function runToolOnlyTurn(params: {
|
||||
apiKey: string;
|
||||
cacheRetention: "none" | "short" | "long";
|
||||
model: LiveResolvedModel["model"];
|
||||
providerTag: "anthropic" | "openai";
|
||||
sessionId: string;
|
||||
systemPrompt: string;
|
||||
tool: Tool;
|
||||
}) {
|
||||
const timeoutMs = params.providerTag === "openai" ? OPENAI_TIMEOUT_MS : ANTHROPIC_TIMEOUT_MS;
|
||||
const options = {
|
||||
apiKey: params.apiKey,
|
||||
cacheRetention: params.cacheRetention,
|
||||
sessionId: params.sessionId,
|
||||
maxTokens: 128,
|
||||
temperature: 0,
|
||||
...(params.providerTag === "openai" ? { reasoning: "none" as unknown as never } : {}),
|
||||
};
|
||||
let prompt = `Call the tool \`${params.tool.name}\` with {}. IMPORTANT: respond ONLY with the tool call and no other text.`;
|
||||
let response = await completeSimpleWithLiveTimeout(
|
||||
params.model,
|
||||
{
|
||||
systemPrompt: params.systemPrompt,
|
||||
messages: [makeUserTurn(prompt)],
|
||||
tools: [params.tool],
|
||||
},
|
||||
options,
|
||||
`${params.providerTag} ${params.tool.name} tool-only turn`,
|
||||
timeoutMs,
|
||||
);
|
||||
|
||||
let toolCall = extractFirstToolCall(response);
|
||||
let text = extractAssistantText(response);
|
||||
for (let attempt = 0; attempt < 2 && (!toolCall || text.length > 0); attempt += 1) {
|
||||
prompt = `Return only a tool call for \`${params.tool.name}\` with {}. No text.`;
|
||||
response = await completeSimpleWithLiveTimeout(
|
||||
params.model,
|
||||
{
|
||||
systemPrompt: params.systemPrompt,
|
||||
messages: [makeUserTurn(prompt)],
|
||||
tools: [params.tool],
|
||||
},
|
||||
options,
|
||||
`${params.providerTag} ${params.tool.name} tool-only retry ${attempt + 1}`,
|
||||
timeoutMs,
|
||||
);
|
||||
toolCall = extractFirstToolCall(response);
|
||||
text = extractAssistantText(response);
|
||||
}
|
||||
|
||||
assert(toolCall, `expected tool call for ${params.tool.name}`);
|
||||
assert(
|
||||
text.length === 0,
|
||||
`expected tool-only response for ${params.tool.name}, got ${JSON.stringify(text)}`,
|
||||
);
|
||||
assert(toolCall.type === "toolCall", `expected toolCall block for ${params.tool.name}`);
|
||||
|
||||
return {
|
||||
prompt,
|
||||
response,
|
||||
toolCall,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Issue one live completion and turn it into a CacheRun sample.
 *
 * Sends `messages` under `systemPrompt` (optionally with tools), asserts the
 * assistant echoed the expected `suffix` marker (case-insensitive), and
 * returns the text plus usage counters and the derived cache hit rate.
 * Throws (via `assert`) if the suffix is missing from the response.
 */
async function completeCacheProbe(params: {
  apiKey: string;
  cacheRetention: "none" | "short" | "long";
  messages: Message[];
  model: LiveResolvedModel["model"];
  providerTag: "anthropic" | "openai";
  sessionId: string;
  suffix: string;
  systemPrompt: string;
  tools?: Tool[];
  maxTokens?: number;
}): Promise<CacheRun> {
  // Provider-specific request timeout.
  const timeoutMs = params.providerTag === "openai" ? OPENAI_TIMEOUT_MS : ANTHROPIC_TIMEOUT_MS;
  const response = await completeSimpleWithLiveTimeout(
    params.model,
    {
      systemPrompt: params.systemPrompt,
      messages: params.messages,
      // Only attach a tools array when the lane actually uses tools.
      ...(params.tools ? { tools: params.tools } : {}),
    },
    {
      apiKey: params.apiKey,
      cacheRetention: params.cacheRetention,
      sessionId: params.sessionId,
      maxTokens: params.maxTokens ?? 64,
      temperature: 0,
      // OpenAI responses API: suppress reasoning tokens for short deterministic turns.
      ...(params.providerTag === "openai" ? { reasoning: "none" as unknown as never } : {}),
    },
    `${params.providerTag} cache lane ${params.suffix}`,
    timeoutMs,
  );
  const text = extractAssistantText(response);
  // The prompt asks for "CACHE-OK <suffix>"; verify the marker actually came back.
  assert(
    text.toLowerCase().includes(params.suffix.toLowerCase()),
    `expected response to contain ${params.suffix}, got ${JSON.stringify(text)}`,
  );
  return {
    suffix: params.suffix,
    text,
    usage: response.usage,
    hitRate: computeCacheHitRate(response.usage),
  };
}
|
||||
|
||||
/**
 * Run one cache lane (stable / image / tool / mcp) three times against a
 * provider: a warmup call that writes the cache, then two hit runs.
 *
 * Returns the warmup sample and the better (higher cacheRead) of the two hit
 * runs as `best`. Each suffix is unique per call so the assistant's echoed
 * marker can be verified per request while the prefix stays cacheable.
 */
async function runRepeatedLane(params: {
  lane: CacheLane;
  providerTag: "anthropic" | "openai";
  fixture: LiveResolvedModel;
  runToken: string;
  sessionId: string;
  pngBase64: string;
}): Promise<LaneResult> {
  const suffixBase = `${params.providerTag}-${params.lane}`;
  // OpenAI uses a dedicated prefix for the MCP lane; Anthropic shares one prefix.
  const systemPromptBase =
    params.providerTag === "openai"
      ? params.lane === "mcp"
        ? OPENAI_MCP_PREFIX
        : OPENAI_PREFIX
      : ANTHROPIC_PREFIX;
  // Fresh run token + lane tag namespace the prompt so earlier runs' cache
  // state cannot pollute this sample.
  const systemPrompt = `${systemPromptBase}\nRun token: ${params.runToken}\nLane: ${params.providerTag}-${params.lane}\n`;

  // `run` performs one lane-shaped probe for a given suffix.
  const run =
    params.lane === "stable"
      ? // Stable lane: single short user turn under the long cached prefix.
        (suffix: string) =>
          completeCacheProbe({
            apiKey: params.fixture.apiKey,
            cacheRetention: "short",
            messages: [makeUserTurn(`Reply with exactly CACHE-OK ${suffix}.`)],
            model: params.fixture.model,
            providerTag: params.providerTag,
            sessionId: params.sessionId,
            suffix,
            systemPrompt,
            maxTokens: 32,
          })
      : params.lane === "image"
        ? // Image lane: a fixed multi-turn history containing PNG bytes.
          (suffix: string) =>
            completeCacheProbe({
              apiKey: params.fixture.apiKey,
              cacheRetention: "short",
              messages: [
                makeImageUserTurn(
                  "An image is attached. Ignore image semantics but keep the bytes in history.",
                  params.pngBase64,
                ),
                buildAssistantHistoryTurn("IMAGE HISTORY ACKNOWLEDGED", params.fixture.model),
                makeUserTurn("Keep the earlier image turn stable in context."),
                buildAssistantHistoryTurn("IMAGE HISTORY PRESERVED", params.fixture.model),
                makeUserTurn(`Reply with exactly CACHE-OK ${suffix}.`),
              ],
              model: params.fixture.model,
              providerTag: params.providerTag,
              sessionId: params.sessionId,
              suffix,
              systemPrompt,
            })
        : // Tool / MCP lanes: drive a real tool-only turn, then replay it as history.
          async (suffix: string) => {
            const tool = params.lane === "mcp" ? MCP_TOOL : NOOP_TOOL;
            const toolText = params.lane === "mcp" ? "FROM-BUNDLE" : "ok";
            const historyPrefix = params.lane === "mcp" ? "MCP TOOL HISTORY" : "TOOL HISTORY";
            const toolTurn = await runToolOnlyTurn({
              apiKey: params.fixture.apiKey,
              cacheRetention: "short",
              model: params.fixture.model,
              providerTag: params.providerTag,
              sessionId: params.sessionId,
              systemPrompt,
              tool,
            });
            return await completeCacheProbe({
              apiKey: params.fixture.apiKey,
              cacheRetention: "short",
              messages: [
                makeUserTurn(toolTurn.prompt),
                toolTurn.response,
                makeToolResultMessage(toolTurn.toolCall.id, tool.name, toolText),
                buildAssistantHistoryTurn(`${historyPrefix} ACKNOWLEDGED`, params.fixture.model),
                makeUserTurn(
                  params.lane === "mcp"
                    ? "Keep the MCP tool output stable in history."
                    : "Keep the tool output stable in history.",
                ),
                buildAssistantHistoryTurn(`${historyPrefix} PRESERVED`, params.fixture.model),
                makeUserTurn(`Reply with exactly CACHE-OK ${suffix}.`),
              ],
              model: params.fixture.model,
              providerTag: params.providerTag,
              sessionId: params.sessionId,
              suffix,
              systemPrompt,
              tools: [tool],
            });
          };

  // Warmup writes the cache; the two hit runs should read it.
  const warmup = await run(`${suffixBase}-warmup`);
  const hitA = await run(`${suffixBase}-hit-a`);
  const hitB = await run(`${suffixBase}-hit-b`);
  // Keep the run with the larger cacheRead as the lane's best sample.
  const best = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
  return { best, warmup };
}
|
||||
|
||||
async function runAnthropicDisabledLane(params: {
|
||||
fixture: LiveResolvedModel;
|
||||
runToken: string;
|
||||
sessionId: string;
|
||||
}): Promise<LaneResult> {
|
||||
const disabled = await completeCacheProbe({
|
||||
apiKey: params.fixture.apiKey,
|
||||
cacheRetention: "none",
|
||||
messages: [makeUserTurn("Reply with exactly CACHE-OK anthropic-disabled.")],
|
||||
model: params.fixture.model,
|
||||
providerTag: "anthropic",
|
||||
sessionId: params.sessionId,
|
||||
suffix: "anthropic-disabled",
|
||||
systemPrompt: `${ANTHROPIC_PREFIX}\nRun token: ${params.runToken}\nLane: anthropic-disabled\n`,
|
||||
maxTokens: 32,
|
||||
});
|
||||
return { disabled };
|
||||
}
|
||||
|
||||
function formatUsage(usage: AssistantMessage["usage"]) {
|
||||
return `cacheRead=${usage.cacheRead ?? 0} cacheWrite=${usage.cacheWrite ?? 0} input=${usage.input ?? 0}`;
|
||||
}
|
||||
|
||||
/**
 * Compare one lane's result against its stored baseline floor and append a
 * human-readable message to `regressions` for every violated bound.
 *
 * Checks applied (each only when the corresponding sample exists):
 * - best run: cacheRead >= minCacheRead, hitRate >= minHitRate
 * - warmup run: cacheWrite >= minCacheWrite
 * - disabled run: cacheRead <= maxCacheRead, cacheWrite <= maxCacheWrite
 */
function assertAgainstBaseline(params: {
  lane: string;
  provider: ProviderKey;
  result: LaneResult;
  regressions: string[];
}) {
  // Look up the lane's floor entry; a missing entry is itself a regression.
  const floor =
    LIVE_CACHE_REGRESSION_BASELINE[params.provider][
      params.lane as keyof (typeof LIVE_CACHE_REGRESSION_BASELINE)[typeof params.provider]
    ];
  if (!floor) {
    params.regressions.push(`${params.provider}:${params.lane} missing baseline entry`);
    return;
  }

  // Best hit run: lower bounds on cacheRead and hit rate.
  if (params.result.best) {
    const usage = params.result.best.usage;
    if ((usage.cacheRead ?? 0) < (floor.minCacheRead ?? 0)) {
      params.regressions.push(
        `${params.provider}:${params.lane} cacheRead=${usage.cacheRead ?? 0} < min=${floor.minCacheRead}`,
      );
    }
    if (params.result.best.hitRate < (floor.minHitRate ?? 0)) {
      params.regressions.push(
        `${params.provider}:${params.lane} hitRate=${params.result.best.hitRate.toFixed(3)} < min=${floor.minHitRate?.toFixed(3)}`,
      );
    }
  }

  // Warmup run: lower bound on cacheWrite (proves the cache was populated).
  if (params.result.warmup) {
    const warmupUsage = params.result.warmup.usage;
    if ((warmupUsage.cacheWrite ?? 0) < (floor.minCacheWrite ?? 0)) {
      params.regressions.push(
        `${params.provider}:${params.lane} warmup cacheWrite=${warmupUsage.cacheWrite ?? 0} < min=${floor.minCacheWrite}`,
      );
    }
  }

  // Disabled control: upper bounds (caching off must not read/write the cache).
  if (params.result.disabled) {
    const usage = params.result.disabled.usage;
    if ((usage.cacheRead ?? 0) > (floor.maxCacheRead ?? Number.POSITIVE_INFINITY)) {
      params.regressions.push(
        `${params.provider}:${params.lane} cacheRead=${usage.cacheRead ?? 0} > max=${floor.maxCacheRead}`,
      );
    }
    if ((usage.cacheWrite ?? 0) > (floor.maxCacheWrite ?? Number.POSITIVE_INFINITY)) {
      params.regressions.push(
        `${params.provider}:${params.lane} cacheWrite=${usage.cacheWrite ?? 0} > max=${floor.maxCacheWrite}`,
      );
    }
  }
}
|
||||
|
||||
export async function runLiveCacheRegression(): Promise<LiveCacheRegressionResult> {
|
||||
const pngBase64 = (await fs.readFile(LIVE_TEST_PNG_URL)).toString("base64");
|
||||
const runToken = `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`;
|
||||
const openai = await resolveLiveDirectModel({
|
||||
provider: "openai",
|
||||
api: "openai-responses",
|
||||
envVar: "OPENCLAW_LIVE_OPENAI_CACHE_MODEL",
|
||||
preferredModelIds: ["gpt-5.4-mini", "gpt-5.4", "gpt-5.2"],
|
||||
});
|
||||
const anthropic = await resolveLiveDirectModel({
|
||||
provider: "anthropic",
|
||||
api: "anthropic-messages",
|
||||
envVar: "OPENCLAW_LIVE_ANTHROPIC_CACHE_MODEL",
|
||||
preferredModelIds: ["claude-sonnet-4-6", "claude-sonnet-4-5", "claude-haiku-3-5"],
|
||||
});
|
||||
|
||||
const regressions: string[] = [];
|
||||
const summary: Record<string, Record<string, unknown>> = {
|
||||
anthropic: {},
|
||||
openai: {},
|
||||
};
|
||||
|
||||
for (const lane of ["stable", "tool", "image", "mcp"] as const) {
|
||||
const openaiResult = await runRepeatedLane({
|
||||
lane,
|
||||
providerTag: "openai",
|
||||
fixture: openai,
|
||||
runToken,
|
||||
sessionId: `live-cache-regression-${runToken}-openai-${lane}`,
|
||||
pngBase64,
|
||||
});
|
||||
logLiveCache(
|
||||
`openai ${lane} warmup ${formatUsage(openaiResult.warmup?.usage ?? {})} rate=${openaiResult.warmup?.hitRate.toFixed(3) ?? "0.000"}`,
|
||||
);
|
||||
logLiveCache(
|
||||
`openai ${lane} best ${formatUsage(openaiResult.best?.usage ?? {})} rate=${openaiResult.best?.hitRate.toFixed(3) ?? "0.000"}`,
|
||||
);
|
||||
summary.openai[lane] = {
|
||||
best: openaiResult.best?.usage,
|
||||
hitRate: openaiResult.best?.hitRate,
|
||||
warmup: openaiResult.warmup?.usage,
|
||||
};
|
||||
assertAgainstBaseline({
|
||||
lane,
|
||||
provider: "openai",
|
||||
result: openaiResult,
|
||||
regressions,
|
||||
});
|
||||
|
||||
const anthropicResult = await runRepeatedLane({
|
||||
lane,
|
||||
providerTag: "anthropic",
|
||||
fixture: anthropic,
|
||||
runToken,
|
||||
sessionId: `live-cache-regression-${runToken}-anthropic-${lane}`,
|
||||
pngBase64,
|
||||
});
|
||||
logLiveCache(
|
||||
`anthropic ${lane} warmup ${formatUsage(anthropicResult.warmup?.usage ?? {})} rate=${anthropicResult.warmup?.hitRate.toFixed(3) ?? "0.000"}`,
|
||||
);
|
||||
logLiveCache(
|
||||
`anthropic ${lane} best ${formatUsage(anthropicResult.best?.usage ?? {})} rate=${anthropicResult.best?.hitRate.toFixed(3) ?? "0.000"}`,
|
||||
);
|
||||
summary.anthropic[lane] = {
|
||||
best: anthropicResult.best?.usage,
|
||||
hitRate: anthropicResult.best?.hitRate,
|
||||
warmup: anthropicResult.warmup?.usage,
|
||||
};
|
||||
assertAgainstBaseline({
|
||||
lane,
|
||||
provider: "anthropic",
|
||||
result: anthropicResult,
|
||||
regressions,
|
||||
});
|
||||
}
|
||||
|
||||
const disabled = await runAnthropicDisabledLane({
|
||||
fixture: anthropic,
|
||||
runToken,
|
||||
sessionId: `live-cache-regression-${runToken}-anthropic-disabled`,
|
||||
});
|
||||
logLiveCache(`anthropic disabled ${formatUsage(disabled.disabled?.usage ?? {})}`);
|
||||
summary.anthropic.disabled = {
|
||||
disabled: disabled.disabled?.usage,
|
||||
};
|
||||
assertAgainstBaseline({
|
||||
lane: "disabled",
|
||||
provider: "anthropic",
|
||||
result: disabled,
|
||||
regressions,
|
||||
});
|
||||
|
||||
logLiveCache(`cache regression summary ${JSON.stringify(summary)}`);
|
||||
return { regressions, summary };
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
import { describe, expect, it } from "vitest";
import { runLiveCacheRegression } from "./live-cache-regression-runner.js";
import { LIVE_CACHE_TEST_ENABLED } from "./live-cache-test-support.js";

// Skip the whole suite unless the live cache gate env flags are set
// (OPENCLAW_LIVE_TEST=1 and OPENCLAW_LIVE_CACHE_TEST=1).
const describeCacheLive = LIVE_CACHE_TEST_ENABLED ? describe : describe.skip;

describeCacheLive("live cache regression", () => {
  it(
    "matches the stored provider cache baselines",
    async () => {
      const result = await runLiveCacheRegression();
      // Any entry in `regressions` describes a violated floor; expect none.
      expect(result.regressions).toEqual([]);
    },
    // Generous 30-minute budget: the combined gate issues many live API calls.
    30 * 60_000,
  );
});
|
||||
Loading…
Reference in New Issue