ci: gate releases on live cache floors

This commit is contained in:
Peter Steinberger 2026-04-04 15:44:21 +09:00
parent be4eb269fc
commit 6e6b4f6004
No known key found for this signature in database
8 changed files with 634 additions and 8 deletions

View File

@ -129,6 +129,31 @@ jobs:
- name: Verify release contents
run: pnpm release:check
- name: Validate live cache credentials
if: ${{ github.ref == 'refs/heads/main' }}
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
set -euo pipefail
if [[ -z "${OPENAI_API_KEY}" ]]; then
echo "Missing OPENAI_API_KEY secret for release live cache validation." >&2
exit 1
fi
if [[ -z "${ANTHROPIC_API_KEY}" ]]; then
echo "Missing ANTHROPIC_API_KEY secret for release live cache validation." >&2
exit 1
fi
- name: Verify live prompt cache floors
if: ${{ github.ref == 'refs/heads/main' }}
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENCLAW_LIVE_CACHE_TEST: "1"
OPENCLAW_LIVE_TEST: "1"
run: pnpm test:live:cache
- name: Pack prepared npm tarball
id: packed_tarball
env:

View File

@ -41,6 +41,10 @@ OpenClaw has three public release lanes:
`dist/*` release artifacts and Control UI bundle exist for the pack
validation step
- Run `pnpm release:check` before every tagged release
- Main-branch npm preflight also runs
`OPENCLAW_LIVE_TEST=1 OPENCLAW_LIVE_CACHE_TEST=1 pnpm test:live:cache`
before packaging the tarball, using both `OPENAI_API_KEY` and
`ANTHROPIC_API_KEY` workflow secrets
- Run `RELEASE_TAG=vYYYY.M.D node --import tsx scripts/openclaw-npm-release-check.ts`
(or the matching beta/correction tag) before approval
- After npm publish, run

View File

@ -157,10 +157,19 @@ OpenClaw exposes dedicated cache-trace diagnostics for embedded agent runs.
## Live regression tests
OpenClaw keeps provider-specific live cache probes for repeated prefixes, tool turns, image turns, and MCP-style tool transcripts.
OpenClaw keeps one combined live cache regression gate for repeated prefixes, tool turns, image turns, MCP-style tool transcripts, and an Anthropic no-cache control.
- `src/agents/pi-embedded-runner.cache.live.test.ts`
- `src/agents/pi-mcp-style.cache.live.test.ts`
- `src/agents/live-cache-regression.live.test.ts`
- `src/agents/live-cache-regression-baseline.ts`
Run the narrow live gate with:
```sh
OPENCLAW_LIVE_TEST=1 OPENCLAW_LIVE_CACHE_TEST=1 pnpm test:live:cache
```
The baseline file stores the most recent observed live numbers plus the provider-specific regression floors used by the test.
The runner also uses fresh per-run session IDs and prompt namespaces so previous cache state does not pollute the current regression sample.
These tests intentionally do not use identical success criteria across providers.
@ -180,12 +189,14 @@ These tests intentionally do not use identical success criteria across providers
- image transcript: `cacheRead >= 3840`, hit rate `>= 0.82`
- MCP-style transcript: `cacheRead >= 4096`, hit rate `>= 0.85`
Fresh OpenAI verification on 2026-04-04 landed at:
Fresh combined live verification on 2026-04-04 landed at:
- stable prefix: `cacheRead=4864`, hit rate `0.971`
- tool transcript: `cacheRead=4608`, hit rate `0.900`
- image transcript: `cacheRead=4864`, hit rate `0.959`
- MCP-style transcript: `cacheRead=4608`, hit rate `0.895`
- stable prefix: `cacheRead=4864`, hit rate `0.966`
- tool transcript: `cacheRead=4608`, hit rate `0.896`
- image transcript: `cacheRead=4864`, hit rate `0.954`
- MCP-style transcript: `cacheRead=4608`, hit rate `0.891`
Recent local wall-clock time for the combined gate was about `88s`.
Why the assertions differ:

View File

@ -1149,6 +1149,7 @@
"test:install:e2e:openai": "OPENCLAW_E2E_MODELS=openai bash scripts/test-install-sh-e2e-docker.sh",
"test:install:smoke": "bash scripts/test-install-sh-docker.sh",
"test:live": "node scripts/test-live.mjs",
"test:live:cache": "bun scripts/check-live-cache.ts",
"test:live:gateway-profiles": "node scripts/test-live.mjs -- src/gateway/gateway-models.profiles.live.test.ts",
"test:live:models-profiles": "node scripts/test-live.mjs -- src/agents/models.profiles.live.test.ts",
"test:max": "OPENCLAW_VITEST_MAX_WORKERS=8 vitest run --config vitest.config.ts",

View File

@ -0,0 +1,18 @@
// scripts/check-live-cache.ts — CLI entry point for the combined live prompt-cache
// regression gate. Exits 0 when every stored floor holds, sets exit code 1 otherwise.
import { runLiveCacheRegression } from "../src/agents/live-cache-regression-runner.js";
import { LIVE_CACHE_TEST_ENABLED, logLiveCache } from "../src/agents/live-cache-test-support.js";

if (!LIVE_CACHE_TEST_ENABLED) {
  // Live probes are opt-in; without both env flags this is a no-op success.
  logLiveCache("skipped; set OPENCLAW_LIVE_TEST=1 and OPENCLAW_LIVE_CACHE_TEST=1");
  process.exit(0);
}

const { regressions } = await runLiveCacheRegression();
if (regressions.length === 0) {
  process.stderr.write("\n[live-cache] all regression floors satisfied\n");
} else {
  // Report every violated floor on stderr, then fail the process.
  process.stderr.write("\n[live-cache] regressions detected:\n");
  regressions.forEach((regression) => {
    process.stderr.write(`- ${regression}\n`);
  });
  process.exitCode = 1;
}

View File

@ -0,0 +1,79 @@
/**
 * One provider/lane baseline entry for the live cache regression gate.
 *
 * `observed*` fields record the most recently verified live numbers; the
 * visible gate logic never reads them, they are documentation of the last run.
 * `min*` fields are enforced floors (best-run cacheRead / hit rate, warmup
 * cacheWrite), and `max*` fields are ceilings used by the cache-disabled
 * control lane. All fields are optional so each provider can pin only the
 * metrics it actually reports.
 */
export type LiveCacheFloor = {
  observedCacheRead?: number;
  observedCacheWrite?: number;
  observedHitRate?: number;
  minCacheRead?: number;
  minCacheWrite?: number;
  minHitRate?: number;
  maxCacheRead?: number;
  maxCacheWrite?: number;
};
/**
 * Stored live cache baselines, keyed by provider then lane.
 * Only `min*`/`max*` values are asserted by the live gate; `observed*` values
 * document the last verified run so future floor adjustments have context.
 */
export const LIVE_CACHE_REGRESSION_BASELINE = {
  anthropic: {
    // Control lane: with cache retention "none", cache traffic must stay near zero.
    disabled: {
      observedCacheRead: 0,
      observedCacheWrite: 0,
      maxCacheRead: 32,
      maxCacheWrite: 32,
    },
    // Repeated transcript containing an inline PNG image turn.
    image: {
      observedCacheRead: 5_660,
      observedCacheWrite: 85,
      observedHitRate: 0.985,
      minCacheRead: 4_500,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
    // MCP-style namespaced tool transcript.
    mcp: {
      observedCacheRead: 6_240,
      observedCacheWrite: 113,
      observedHitRate: 0.982,
      minCacheRead: 5_800,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
    // Plain repeated stable-prefix lane.
    stable: {
      observedCacheRead: 5_660,
      observedCacheWrite: 18,
      observedHitRate: 0.996,
      minCacheRead: 5_400,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
    // Simple noop-tool transcript lane.
    tool: {
      observedCacheRead: 6_223,
      observedCacheWrite: 97,
      observedHitRate: 0.984,
      minCacheRead: 5_000,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
  },
  // OpenAI entries carry no cacheWrite floors — only read counts and hit rates
  // are pinned for this provider.
  openai: {
    image: {
      observedCacheRead: 4_864,
      observedHitRate: 0.954,
      minCacheRead: 3_840,
      minHitRate: 0.82,
    },
    mcp: {
      observedCacheRead: 4_608,
      observedHitRate: 0.891,
      minCacheRead: 4_096,
      minHitRate: 0.85,
    },
    stable: {
      observedCacheRead: 4_864,
      observedHitRate: 0.966,
      minCacheRead: 4_608,
      minHitRate: 0.9,
    },
    tool: {
      observedCacheRead: 4_608,
      observedHitRate: 0.896,
      minCacheRead: 4_096,
      minHitRate: 0.85,
    },
  },
} as const satisfies Record<string, Record<string, LiveCacheFloor>>;

View File

@ -0,0 +1,472 @@
import fs from "node:fs/promises";
import type { AssistantMessage, Message, Tool } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import { LIVE_CACHE_REGRESSION_BASELINE } from "./live-cache-regression-baseline.js";
import {
buildAssistantHistoryTurn,
buildStableCachePrefix,
completeSimpleWithLiveTimeout,
computeCacheHitRate,
extractAssistantText,
logLiveCache,
resolveLiveDirectModel,
} from "./live-cache-test-support.js";
// Per-call timeout ceilings for live completions, by provider.
const OPENAI_TIMEOUT_MS = 120_000;
const ANTHROPIC_TIMEOUT_MS = 120_000;
// Stable system-prompt prefixes; the OpenAI MCP lane gets a distinct prefix
// from the other OpenAI lanes.
const OPENAI_PREFIX = buildStableCachePrefix("openai");
const OPENAI_MCP_PREFIX = buildStableCachePrefix("openai-mcp-style");
const ANTHROPIC_PREFIX = buildStableCachePrefix("anthropic");
// Repo-local PNG attached by the image lane (loaded and base64-encoded at run time).
const LIVE_TEST_PNG_URL = new URL(
  "../../apps/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png",
  import.meta.url,
);
// Resolved live model fixture (model handle plus API key) as returned by the
// shared live-test support helper.
type LiveResolvedModel = Awaited<ReturnType<typeof resolveLiveDirectModel>>;
type ProviderKey = keyof typeof LIVE_CACHE_REGRESSION_BASELINE;
// The four repeated-prefix lanes exercised against each provider.
type CacheLane = "image" | "mcp" | "stable" | "tool";
// One completed probe call: the per-call suffix marker, the assistant text,
// raw usage counters, and the derived cache hit rate.
type CacheRun = {
  hitRate: number;
  suffix: string;
  text: string;
  usage: AssistantMessage["usage"];
};
// Per-lane outcome: warmup plus best repeat run for cached lanes, or a single
// `disabled` run for the cache-off control lane.
type LaneResult = {
  best?: CacheRun;
  disabled?: CacheRun;
  warmup?: CacheRun;
};
export type LiveCacheRegressionResult = {
  regressions: string[];
  summary: Record<string, Record<string, unknown>>;
};
// Minimal zero-argument tool used by the plain `tool` lane.
const NOOP_TOOL: Tool = {
  name: "noop",
  description: "Return ok.",
  parameters: Type.Object({}, { additionalProperties: false }),
};
// MCP-style namespaced tool name used by the `mcp` lane.
const MCP_TOOL: Tool = {
  name: "bundleProbe__bundle_probe",
  description: "Return bundle MCP probe text.",
  parameters: Type.Object({}, { additionalProperties: false }),
};
/** Wrap arbitrary user-turn content in a timestamped user-role message. */
function makeUserTurn(content: Extract<Message, { role: "user" }>["content"]): Message {
  const timestamp = Date.now();
  return { role: "user", content, timestamp };
}
/** User turn containing a text block followed by an inline base64 PNG block. */
function makeImageUserTurn(text: string, pngBase64: string): Message {
  const blocks: Extract<Message, { role: "user" }>["content"] = [
    { type: "text", text },
    { type: "image", mimeType: "image/png", data: pngBase64 },
  ];
  return makeUserTurn(blocks);
}
/** Build a successful (isError: false) tool-result turn with one text block. */
function makeToolResultMessage(
  toolCallId: string,
  toolName: string,
  text: string,
): Extract<Message, { role: "toolResult" }> {
  const content = [{ type: "text" as const, text }];
  return {
    content,
    isError: false,
    role: "toolResult",
    timestamp: Date.now(),
    toolCallId,
    toolName,
  };
}
/** Return the first toolCall content block of an assistant message, if any. */
function extractFirstToolCall(message: AssistantMessage) {
  for (const block of message.content) {
    if (block.type === "toolCall") {
      return block;
    }
  }
  return undefined;
}
/** Narrowing assertion helper: throws `message` when `condition` is falsy. */
function assert(condition: unknown, message: string): asserts condition {
  if (condition) {
    return;
  }
  throw new Error(message);
}
/**
 * Ask the model to produce exactly one call to `params.tool` with no prose.
 *
 * Performs an initial attempt, then retries up to two more times with a
 * stricter prompt whenever the response is missing the tool call or contains
 * extra text. Asserts the final response is a pure tool call and returns the
 * prompt actually used, the raw assistant response, and the extracted
 * toolCall block (callers replay all three as transcript history).
 */
async function runToolOnlyTurn(params: {
  apiKey: string;
  cacheRetention: "none" | "short" | "long";
  model: LiveResolvedModel["model"];
  providerTag: "anthropic" | "openai";
  sessionId: string;
  systemPrompt: string;
  tool: Tool;
}) {
  const timeoutMs = params.providerTag === "openai" ? OPENAI_TIMEOUT_MS : ANTHROPIC_TIMEOUT_MS;
  const options = {
    apiKey: params.apiKey,
    cacheRetention: params.cacheRetention,
    sessionId: params.sessionId,
    maxTokens: 128,
    temperature: 0,
    // OpenAI-only: disable reasoning output; the double cast suggests the
    // option is not part of the declared options type — TODO confirm upstream.
    ...(params.providerTag === "openai" ? { reasoning: "none" as unknown as never } : {}),
  };
  let prompt = `Call the tool \`${params.tool.name}\` with {}. IMPORTANT: respond ONLY with the tool call and no other text.`;
  let response = await completeSimpleWithLiveTimeout(
    params.model,
    {
      systemPrompt: params.systemPrompt,
      messages: [makeUserTurn(prompt)],
      tools: [params.tool],
    },
    options,
    `${params.providerTag} ${params.tool.name} tool-only turn`,
    timeoutMs,
  );
  let toolCall = extractFirstToolCall(response);
  let text = extractAssistantText(response);
  // Retry (max 2 extra attempts) while the model either skipped the tool call
  // or added prose alongside it.
  for (let attempt = 0; attempt < 2 && (!toolCall || text.length > 0); attempt += 1) {
    prompt = `Return only a tool call for \`${params.tool.name}\` with {}. No text.`;
    response = await completeSimpleWithLiveTimeout(
      params.model,
      {
        systemPrompt: params.systemPrompt,
        messages: [makeUserTurn(prompt)],
        tools: [params.tool],
      },
      options,
      `${params.providerTag} ${params.tool.name} tool-only retry ${attempt + 1}`,
      timeoutMs,
    );
    toolCall = extractFirstToolCall(response);
    text = extractAssistantText(response);
  }
  // Hard requirements: a tool call exists, carries no text, and is a toolCall block.
  assert(toolCall, `expected tool call for ${params.tool.name}`);
  assert(
    text.length === 0,
    `expected tool-only response for ${params.tool.name}, got ${JSON.stringify(text)}`,
  );
  assert(toolCall.type === "toolCall", `expected toolCall block for ${params.tool.name}`);
  return {
    prompt,
    response,
    toolCall,
  };
}
/**
 * Run one live completion and package it as a CacheRun sample.
 *
 * Sends `params.messages` under `params.systemPrompt` with the given cache
 * retention, asserts the reply contains `params.suffix` (case-insensitive, so
 * minor casing drift from the model does not fail the probe), and returns the
 * response text, raw usage counters, and the computed cache hit rate.
 */
async function completeCacheProbe(params: {
  apiKey: string;
  cacheRetention: "none" | "short" | "long";
  messages: Message[];
  model: LiveResolvedModel["model"];
  providerTag: "anthropic" | "openai";
  sessionId: string;
  suffix: string;
  systemPrompt: string;
  tools?: Tool[];
  maxTokens?: number;
}): Promise<CacheRun> {
  const timeoutMs = params.providerTag === "openai" ? OPENAI_TIMEOUT_MS : ANTHROPIC_TIMEOUT_MS;
  const response = await completeSimpleWithLiveTimeout(
    params.model,
    {
      systemPrompt: params.systemPrompt,
      messages: params.messages,
      // Only attach a tools array when the lane actually supplies one.
      ...(params.tools ? { tools: params.tools } : {}),
    },
    {
      apiKey: params.apiKey,
      cacheRetention: params.cacheRetention,
      sessionId: params.sessionId,
      maxTokens: params.maxTokens ?? 64,
      temperature: 0,
      // OpenAI-only reasoning opt-out; see runToolOnlyTurn for the same cast.
      ...(params.providerTag === "openai" ? { reasoning: "none" as unknown as never } : {}),
    },
    `${params.providerTag} cache lane ${params.suffix}`,
    timeoutMs,
  );
  const text = extractAssistantText(response);
  assert(
    text.toLowerCase().includes(params.suffix.toLowerCase()),
    `expected response to contain ${params.suffix}, got ${JSON.stringify(text)}`,
  );
  return {
    suffix: params.suffix,
    text,
    usage: response.usage,
    hitRate: computeCacheHitRate(response.usage),
  };
}
/**
 * Execute one cache lane for one provider: a warmup call followed by two
 * repeat calls against the same system prompt, returning the warmup run and
 * the repeat run with the higher cacheRead.
 *
 * Lane shapes:
 * - `stable`: a single plain user turn under the stable prefix.
 * - `image`: a transcript whose history contains an inline PNG turn plus
 *   fixed assistant acknowledgements, so the image bytes stay in context.
 * - `tool` / `mcp`: first force a tool-only turn (noop vs MCP-style tool),
 *   then probe with that tool call, its result, and fixed history replayed.
 *
 * The system prompt embeds the per-run token and lane name so each run/lane
 * gets its own cache namespace.
 */
async function runRepeatedLane(params: {
  lane: CacheLane;
  providerTag: "anthropic" | "openai";
  fixture: LiveResolvedModel;
  runToken: string;
  sessionId: string;
  pngBase64: string;
}): Promise<LaneResult> {
  const suffixBase = `${params.providerTag}-${params.lane}`;
  // OpenAI's MCP lane uses its own prefix; all other OpenAI lanes share one,
  // and Anthropic lanes share a single prefix.
  const systemPromptBase =
    params.providerTag === "openai"
      ? params.lane === "mcp"
        ? OPENAI_MCP_PREFIX
        : OPENAI_PREFIX
      : ANTHROPIC_PREFIX;
  const systemPrompt = `${systemPromptBase}\nRun token: ${params.runToken}\nLane: ${params.providerTag}-${params.lane}\n`;
  // Select the per-lane probe closure; each takes the unique reply suffix.
  const run =
    params.lane === "stable"
      ? (suffix: string) =>
          completeCacheProbe({
            apiKey: params.fixture.apiKey,
            cacheRetention: "short",
            messages: [makeUserTurn(`Reply with exactly CACHE-OK ${suffix}.`)],
            model: params.fixture.model,
            providerTag: params.providerTag,
            sessionId: params.sessionId,
            suffix,
            systemPrompt,
            maxTokens: 32,
          })
      : params.lane === "image"
        ? (suffix: string) =>
            completeCacheProbe({
              apiKey: params.fixture.apiKey,
              cacheRetention: "short",
              messages: [
                makeImageUserTurn(
                  "An image is attached. Ignore image semantics but keep the bytes in history.",
                  params.pngBase64,
                ),
                buildAssistantHistoryTurn("IMAGE HISTORY ACKNOWLEDGED", params.fixture.model),
                makeUserTurn("Keep the earlier image turn stable in context."),
                buildAssistantHistoryTurn("IMAGE HISTORY PRESERVED", params.fixture.model),
                makeUserTurn(`Reply with exactly CACHE-OK ${suffix}.`),
              ],
              model: params.fixture.model,
              providerTag: params.providerTag,
              sessionId: params.sessionId,
              suffix,
              systemPrompt,
            })
        : async (suffix: string) => {
            // tool / mcp lanes: force a real tool-only turn first, then replay
            // it (call + result + fixed acknowledgements) in the probe history.
            const tool = params.lane === "mcp" ? MCP_TOOL : NOOP_TOOL;
            const toolText = params.lane === "mcp" ? "FROM-BUNDLE" : "ok";
            const historyPrefix = params.lane === "mcp" ? "MCP TOOL HISTORY" : "TOOL HISTORY";
            const toolTurn = await runToolOnlyTurn({
              apiKey: params.fixture.apiKey,
              cacheRetention: "short",
              model: params.fixture.model,
              providerTag: params.providerTag,
              sessionId: params.sessionId,
              systemPrompt,
              tool,
            });
            return await completeCacheProbe({
              apiKey: params.fixture.apiKey,
              cacheRetention: "short",
              messages: [
                makeUserTurn(toolTurn.prompt),
                toolTurn.response,
                makeToolResultMessage(toolTurn.toolCall.id, tool.name, toolText),
                buildAssistantHistoryTurn(`${historyPrefix} ACKNOWLEDGED`, params.fixture.model),
                makeUserTurn(
                  params.lane === "mcp"
                    ? "Keep the MCP tool output stable in history."
                    : "Keep the tool output stable in history.",
                ),
                buildAssistantHistoryTurn(`${historyPrefix} PRESERVED`, params.fixture.model),
                makeUserTurn(`Reply with exactly CACHE-OK ${suffix}.`),
              ],
              model: params.fixture.model,
              providerTag: params.providerTag,
              sessionId: params.sessionId,
              suffix,
              systemPrompt,
              tools: [tool],
            });
          };
  // Warmup populates the cache; the two hit runs should read from it.
  const warmup = await run(`${suffixBase}-warmup`);
  const hitA = await run(`${suffixBase}-hit-a`);
  const hitB = await run(`${suffixBase}-hit-b`);
  // Keep the repeat run with the larger cacheRead as the lane's best sample.
  const best = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
  return { best, warmup };
}
/**
 * Anthropic no-cache control lane: one probe with cacheRetention "none".
 * The baseline asserts this run reports (near) zero cache read/write.
 */
async function runAnthropicDisabledLane(params: {
  fixture: LiveResolvedModel;
  runToken: string;
  sessionId: string;
}): Promise<LaneResult> {
  const systemPrompt = `${ANTHROPIC_PREFIX}\nRun token: ${params.runToken}\nLane: anthropic-disabled\n`;
  const probe = await completeCacheProbe({
    apiKey: params.fixture.apiKey,
    cacheRetention: "none",
    maxTokens: 32,
    messages: [makeUserTurn("Reply with exactly CACHE-OK anthropic-disabled.")],
    model: params.fixture.model,
    providerTag: "anthropic",
    sessionId: params.sessionId,
    suffix: "anthropic-disabled",
    systemPrompt,
  });
  return { disabled: probe };
}
/** Render cache/input token counters for logging; missing fields count as 0. */
function formatUsage(usage: AssistantMessage["usage"]) {
  const cacheRead = usage.cacheRead ?? 0;
  const cacheWrite = usage.cacheWrite ?? 0;
  const input = usage.input ?? 0;
  return `cacheRead=${cacheRead} cacheWrite=${cacheWrite} input=${input}`;
}
/**
 * Compare one lane's result with the stored baseline entry, appending one
 * human-readable string to `params.regressions` per violated floor/ceiling.
 * Best-run cacheRead and hit rate are checked against `min*` floors, warmup
 * cacheWrite against its floor, and the disabled control run against the
 * `max*` ceilings; a missing baseline entry is itself a regression.
 */
function assertAgainstBaseline(params: {
  lane: string;
  provider: ProviderKey;
  result: LaneResult;
  regressions: string[];
}) {
  const { lane, provider, result, regressions } = params;
  const providerBaseline = LIVE_CACHE_REGRESSION_BASELINE[provider];
  const floor = providerBaseline[lane as keyof typeof providerBaseline];
  if (!floor) {
    regressions.push(`${provider}:${lane} missing baseline entry`);
    return;
  }
  const best = result.best;
  if (best) {
    const cacheRead = best.usage.cacheRead ?? 0;
    if (cacheRead < (floor.minCacheRead ?? 0)) {
      regressions.push(`${provider}:${lane} cacheRead=${cacheRead} < min=${floor.minCacheRead}`);
    }
    if (best.hitRate < (floor.minHitRate ?? 0)) {
      regressions.push(
        `${provider}:${lane} hitRate=${best.hitRate.toFixed(3)} < min=${floor.minHitRate?.toFixed(3)}`,
      );
    }
  }
  const warmup = result.warmup;
  if (warmup) {
    const cacheWrite = warmup.usage.cacheWrite ?? 0;
    if (cacheWrite < (floor.minCacheWrite ?? 0)) {
      regressions.push(
        `${provider}:${lane} warmup cacheWrite=${cacheWrite} < min=${floor.minCacheWrite}`,
      );
    }
  }
  const disabled = result.disabled;
  if (disabled) {
    const cacheRead = disabled.usage.cacheRead ?? 0;
    const cacheWrite = disabled.usage.cacheWrite ?? 0;
    if (cacheRead > (floor.maxCacheRead ?? Number.POSITIVE_INFINITY)) {
      regressions.push(`${provider}:${lane} cacheRead=${cacheRead} > max=${floor.maxCacheRead}`);
    }
    if (cacheWrite > (floor.maxCacheWrite ?? Number.POSITIVE_INFINITY)) {
      regressions.push(`${provider}:${lane} cacheWrite=${cacheWrite} > max=${floor.maxCacheWrite}`);
    }
  }
}
/**
 * Run the full combined live cache regression gate for both providers.
 *
 * Resolves one OpenAI and one Anthropic live model fixture, then for each of
 * the four cache lanes (stable, tool, image, mcp) runs the repeated-prefix
 * lane sequentially per provider, logs the warmup/best samples, records them
 * in `summary`, and checks them against the stored baseline floors. Finally
 * runs the Anthropic cache-disabled control lane and checks its ceilings.
 * Returns the accumulated regression strings (empty means the gate passed)
 * plus the per-provider usage summary.
 */
export async function runLiveCacheRegression(): Promise<LiveCacheRegressionResult> {
  const pngBase64 = (await fs.readFile(LIVE_TEST_PNG_URL)).toString("base64");
  // Fresh per-run token keeps session IDs and prompt namespaces unique, so
  // prior cache state cannot pollute this run's sample.
  const runToken = `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`;
  const openai = await resolveLiveDirectModel({
    provider: "openai",
    api: "openai-responses",
    envVar: "OPENCLAW_LIVE_OPENAI_CACHE_MODEL",
    preferredModelIds: ["gpt-5.4-mini", "gpt-5.4", "gpt-5.2"],
  });
  const anthropic = await resolveLiveDirectModel({
    provider: "anthropic",
    api: "anthropic-messages",
    envVar: "OPENCLAW_LIVE_ANTHROPIC_CACHE_MODEL",
    preferredModelIds: ["claude-sonnet-4-6", "claude-sonnet-4-5", "claude-haiku-3-5"],
  });
  const regressions: string[] = [];
  const summary: Record<string, Record<string, unknown>> = {
    anthropic: {},
    openai: {},
  };
  // Lanes run strictly sequentially (OpenAI then Anthropic per lane).
  for (const lane of ["stable", "tool", "image", "mcp"] as const) {
    const openaiResult = await runRepeatedLane({
      lane,
      providerTag: "openai",
      fixture: openai,
      runToken,
      sessionId: `live-cache-regression-${runToken}-openai-${lane}`,
      pngBase64,
    });
    logLiveCache(
      `openai ${lane} warmup ${formatUsage(openaiResult.warmup?.usage ?? {})} rate=${openaiResult.warmup?.hitRate.toFixed(3) ?? "0.000"}`,
    );
    logLiveCache(
      `openai ${lane} best ${formatUsage(openaiResult.best?.usage ?? {})} rate=${openaiResult.best?.hitRate.toFixed(3) ?? "0.000"}`,
    );
    summary.openai[lane] = {
      best: openaiResult.best?.usage,
      hitRate: openaiResult.best?.hitRate,
      warmup: openaiResult.warmup?.usage,
    };
    assertAgainstBaseline({
      lane,
      provider: "openai",
      result: openaiResult,
      regressions,
    });
    const anthropicResult = await runRepeatedLane({
      lane,
      providerTag: "anthropic",
      fixture: anthropic,
      runToken,
      sessionId: `live-cache-regression-${runToken}-anthropic-${lane}`,
      pngBase64,
    });
    logLiveCache(
      `anthropic ${lane} warmup ${formatUsage(anthropicResult.warmup?.usage ?? {})} rate=${anthropicResult.warmup?.hitRate.toFixed(3) ?? "0.000"}`,
    );
    logLiveCache(
      `anthropic ${lane} best ${formatUsage(anthropicResult.best?.usage ?? {})} rate=${anthropicResult.best?.hitRate.toFixed(3) ?? "0.000"}`,
    );
    summary.anthropic[lane] = {
      best: anthropicResult.best?.usage,
      hitRate: anthropicResult.best?.hitRate,
      warmup: anthropicResult.warmup?.usage,
    };
    assertAgainstBaseline({
      lane,
      provider: "anthropic",
      result: anthropicResult,
      regressions,
    });
  }
  // Control: verify Anthropic reports (near) zero cache traffic when disabled.
  const disabled = await runAnthropicDisabledLane({
    fixture: anthropic,
    runToken,
    sessionId: `live-cache-regression-${runToken}-anthropic-disabled`,
  });
  logLiveCache(`anthropic disabled ${formatUsage(disabled.disabled?.usage ?? {})}`);
  summary.anthropic.disabled = {
    disabled: disabled.disabled?.usage,
  };
  assertAgainstBaseline({
    lane: "disabled",
    provider: "anthropic",
    result: disabled,
    regressions,
  });
  logLiveCache(`cache regression summary ${JSON.stringify(summary)}`);
  return { regressions, summary };
}

View File

@ -0,0 +1,16 @@
// Vitest wrapper around the combined live cache regression runner.
import { describe, expect, it } from "vitest";
import { runLiveCacheRegression } from "./live-cache-regression-runner.js";
import { LIVE_CACHE_TEST_ENABLED } from "./live-cache-test-support.js";

// Only runs when the live-cache env flags are set; skipped otherwise.
const describeCacheLive = LIVE_CACHE_TEST_ENABLED ? describe : describe.skip;

describeCacheLive("live cache regression", () => {
  it(
    "matches the stored provider cache baselines",
    async () => {
      // An empty regressions list means every provider/lane met its floor.
      const result = await runLiveCacheRegression();
      expect(result.regressions).toEqual([]);
    },
    // Generous per-test timeout for the multi-provider live run.
    30 * 60_000,
  );
});