mirror of https://github.com/openclaw/openclaw.git
test: narrow live transcript scaffolding strip
This commit is contained in:
parent
fe5819887b
commit
ff54c02b7d
|
|
@ -435,6 +435,8 @@ These run `pnpm test:live` inside the repo Docker image, mounting your local con
|
|||
The live-model Docker runners also bind-mount the current checkout read-only and
|
||||
stage it into a temporary workdir inside the container. This keeps the runtime
|
||||
image slim while still running Vitest against your exact local source/config.
|
||||
They also set `OPENCLAW_SKIP_CHANNELS=1` so gateway live probes do not start
|
||||
real Telegram/Discord/etc. channel workers inside the container.
|
||||
`test:docker:live-models` still runs `pnpm test:live`, so pass through
|
||||
`OPENCLAW_LIVE_GATEWAY_*` as well when you need to narrow or exclude gateway
|
||||
live coverage from that Docker lane.
|
||||
|
|
|
|||
|
|
@ -86,6 +86,7 @@ docker run --rm -t \
|
|||
-e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \
|
||||
-e HOME=/home/node \
|
||||
-e NODE_OPTIONS=--disable-warning=ExperimentalWarning \
|
||||
-e OPENCLAW_SKIP_CHANNELS=1 \
|
||||
-e OPENCLAW_DOCKER_AUTH_DIRS_RESOLVED="$AUTH_DIRS_CSV" \
|
||||
-e OPENCLAW_LIVE_TEST=1 \
|
||||
-e OPENCLAW_LIVE_GATEWAY_MODELS="${OPENCLAW_LIVE_GATEWAY_MODELS:-modern}" \
|
||||
|
|
|
|||
|
|
@ -91,6 +91,7 @@ docker run --rm -t \
|
|||
-e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \
|
||||
-e HOME=/home/node \
|
||||
-e NODE_OPTIONS=--disable-warning=ExperimentalWarning \
|
||||
-e OPENCLAW_SKIP_CHANNELS=1 \
|
||||
-e OPENCLAW_DOCKER_AUTH_DIRS_RESOLVED="$AUTH_DIRS_CSV" \
|
||||
-e OPENCLAW_LIVE_TEST=1 \
|
||||
-e OPENCLAW_LIVE_MODELS="${OPENCLAW_LIVE_MODELS:-modern}" \
|
||||
|
|
|
|||
|
|
@ -8,6 +8,11 @@ describe("live model error helpers", () => {
|
|||
// Pins which error strings isModelNotFoundErrorMessage is expected to classify as
// "model not found": three positive shapes and one unrelated negative.
it("detects generic model-not-found messages", () => {
|
||||
// JSON-encoded error body carrying a 404 code and a "model not found" message.
expect(isModelNotFoundErrorMessage('{"code":404,"message":"model not found"}')).toBe(true);
|
||||
// Plain-text "model: <id> not found" form.
expect(isModelNotFoundErrorMessage("model: MiniMax-M2.7-highspeed not found")).toBe(true);
|
||||
// HTTP 400 carrying a not_found_error for a model id, with a trailing request id.
expect(
|
||||
isModelNotFoundErrorMessage(
|
||||
"HTTP 400 not_found_error: model: claude-3-5-haiku-20241022 (request_id: req_123)",
|
||||
),
|
||||
).toBe(true);
|
||||
// An unrelated stream failure is expected NOT to be treated as model-not-found.
expect(isModelNotFoundErrorMessage("request ended without sending any chunks")).toBe(false);
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import { createServer } from "node:net";
|
|||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import type { Api, Model } from "@mariozechner/pi-ai";
|
||||
import { describe, it } from "vitest";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { resolveOpenClawAgentDir } from "../agents/agent-paths.js";
|
||||
import { resolveAgentWorkspaceDir } from "../agents/agent-scope.js";
|
||||
import {
|
||||
|
|
@ -17,6 +17,7 @@ import {
|
|||
isAnthropicBillingError,
|
||||
isAnthropicRateLimitError,
|
||||
} from "../agents/live-auth-keys.js";
|
||||
import { isModelNotFoundErrorMessage } from "../agents/live-model-errors.js";
|
||||
import { isModernModelRef } from "../agents/live-model-filter.js";
|
||||
import { isLiveTestEnabled } from "../agents/live-test-helpers.js";
|
||||
import { getApiKeyForModel } from "../agents/model-auth.js";
|
||||
|
|
@ -28,6 +29,7 @@ import { clearRuntimeConfigSnapshot, loadConfig } from "../config/config.js";
|
|||
import type { ModelsConfig, OpenClawConfig, ModelProviderConfig } from "../config/types.js";
|
||||
import { isTruthyEnvValue } from "../infra/env.js";
|
||||
import { DEFAULT_AGENT_ID } from "../routing/session-key.js";
|
||||
import { stripAssistantInternalScaffolding } from "../shared/text/assistant-visible-text.js";
|
||||
import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js";
|
||||
import { GatewayClient } from "./client.js";
|
||||
import { renderCatNoncePngBase64 } from "./live-image-probe.js";
|
||||
|
|
@ -58,6 +60,7 @@ const GATEWAY_LIVE_HEARTBEAT_MS = Math.max(
|
|||
1_000,
|
||||
toInt(process.env.OPENCLAW_LIVE_GATEWAY_HEARTBEAT_MS, 30_000),
|
||||
);
|
||||
// Allowlist of model keys whose assistant text is passed through scaffolding
// stripping in this live suite; membership is checked by
// shouldStripAssistantScaffoldingForLiveModel. Keys not listed here are left as-is.
const GATEWAY_LIVE_STRIP_SCAFFOLDING_MODEL_KEYS = new Set(["google/gemini-3-flash-preview"]);
|
||||
const GATEWAY_LIVE_MAX_MODELS = resolveGatewayLiveMaxModels();
|
||||
const GATEWAY_LIVE_SUITE_TIMEOUT_MS = resolveGatewayLiveSuiteTimeoutMs(GATEWAY_LIVE_MAX_MODELS);
|
||||
|
||||
|
|
@ -267,6 +270,34 @@ function isMeaningful(text: string): boolean {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
 * Reports whether assistant text for the given live model key should have
 * internal scaffolding stripped.
 *
 * @param modelKey - Model key to look up; may be omitted.
 * @returns `true` only when `modelKey` is truthy and present in
 *   `GATEWAY_LIVE_STRIP_SCAFFOLDING_MODEL_KEYS`; `false` otherwise.
 */
function shouldStripAssistantScaffoldingForLiveModel(modelKey?: string): boolean {
|
||||
// `!!modelKey` short-circuits on undefined/empty keys before the set lookup.
return !!modelKey && GATEWAY_LIVE_STRIP_SCAFFOLDING_MODEL_KEYS.has(modelKey);
|
||||
}
|
||||
|
||||
/**
 * Conditionally strips assistant-internal scaffolding from `text`, depending on
 * the model key.
 *
 * @param text - Assistant text to process.
 * @param modelKey - Model key used to decide whether stripping applies; may be omitted.
 * @returns `text` unchanged (not trimmed) when the model key is not targeted;
 *   otherwise the result of `stripAssistantInternalScaffolding(text)` with
 *   leading/trailing whitespace trimmed.
 */
function maybeStripAssistantScaffoldingForLiveModel(text: string, modelKey?: string): string {
|
||||
// Non-targeted (or missing) model keys bypass stripping entirely.
if (!shouldStripAssistantScaffoldingForLiveModel(modelKey)) {
|
||||
return text;
|
||||
}
|
||||
// Targeted model: strip scaffolding, then trim the remaining text.
return stripAssistantInternalScaffolding(text).trim();
|
||||
}
|
||||
|
||||
// Unit coverage for maybeStripAssistantScaffoldingForLiveModel: the same input is
// run with a targeted model key and with a non-targeted one.
describe("maybeStripAssistantScaffoldingForLiveModel", () => {
|
||||
it("strips scaffolding only for the targeted live model", () => {
|
||||
// Targeted key: the <think>...</think> wrapper is expected to be removed.
expect(
|
||||
maybeStripAssistantScaffoldingForLiveModel(
|
||||
"<think>hidden</think>Visible",
|
||||
"google/gemini-3-flash-preview",
|
||||
),
|
||||
).toBe("Visible");
|
||||
// Non-targeted key: the text is expected to come back byte-for-byte unchanged.
expect(
|
||||
maybeStripAssistantScaffoldingForLiveModel(
|
||||
"<think>hidden</think>Visible",
|
||||
"google/gemini-3-pro-preview",
|
||||
),
|
||||
).toBe("<think>hidden</think>Visible");
|
||||
});
|
||||
});
|
||||
|
||||
function isGoogleModelNotFoundText(text: string): boolean {
|
||||
const trimmed = text.trim();
|
||||
if (!trimmed) {
|
||||
|
|
@ -370,6 +401,7 @@ async function runAnthropicRefusalProbe(params: {
|
|||
message: `Reply with the single word ok. Test token: ${magic}`,
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
context: `${params.label}: refusal-probe`,
|
||||
modelKey: params.modelKey,
|
||||
});
|
||||
assertNoReasoningTags({
|
||||
text: probeText,
|
||||
|
|
@ -388,6 +420,7 @@ async function runAnthropicRefusalProbe(params: {
|
|||
message: "Now reply with exactly: still ok.",
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
context: `${params.label}: refusal-followup`,
|
||||
modelKey: params.modelKey,
|
||||
});
|
||||
assertNoReasoningTags({
|
||||
text: followupText,
|
||||
|
|
@ -560,7 +593,7 @@ function extractTranscriptMessageText(message: unknown): string {
|
|||
.trim();
|
||||
}
|
||||
|
||||
function readSessionAssistantTexts(sessionKey: string): string[] {
|
||||
function readSessionAssistantTexts(sessionKey: string, modelKey?: string): string[] {
|
||||
const { storePath, entry } = loadSessionEntry(sessionKey);
|
||||
if (!entry?.sessionId) {
|
||||
return [];
|
||||
|
|
@ -575,7 +608,9 @@ function readSessionAssistantTexts(sessionKey: string): string[] {
|
|||
if (role !== "assistant") {
|
||||
continue;
|
||||
}
|
||||
assistantTexts.push(extractTranscriptMessageText(message));
|
||||
assistantTexts.push(
|
||||
maybeStripAssistantScaffoldingForLiveModel(extractTranscriptMessageText(message), modelKey),
|
||||
);
|
||||
}
|
||||
return assistantTexts;
|
||||
}
|
||||
|
|
@ -584,12 +619,13 @@ async function waitForSessionAssistantText(params: {
|
|||
sessionKey: string;
|
||||
baselineAssistantCount: number;
|
||||
context: string;
|
||||
modelKey?: string;
|
||||
}) {
|
||||
const startedAt = Date.now();
|
||||
let lastHeartbeatAt = startedAt;
|
||||
let delayMs = 50;
|
||||
while (Date.now() - startedAt < GATEWAY_LIVE_PROBE_TIMEOUT_MS) {
|
||||
const assistantTexts = readSessionAssistantTexts(params.sessionKey);
|
||||
const assistantTexts = readSessionAssistantTexts(params.sessionKey, params.modelKey);
|
||||
if (assistantTexts.length > params.baselineAssistantCount) {
|
||||
const freshText = assistantTexts
|
||||
.slice(params.baselineAssistantCount)
|
||||
|
|
@ -618,13 +654,17 @@ async function requestGatewayAgentText(params: {
|
|||
thinkingLevel: string;
|
||||
context: string;
|
||||
idempotencyKey: string;
|
||||
modelKey?: string;
|
||||
attachments?: Array<{
|
||||
mimeType: string;
|
||||
fileName: string;
|
||||
content: string;
|
||||
}>;
|
||||
}) {
|
||||
const baselineAssistantCount = readSessionAssistantTexts(params.sessionKey).length;
|
||||
const baselineAssistantCount = readSessionAssistantTexts(
|
||||
params.sessionKey,
|
||||
params.modelKey,
|
||||
).length;
|
||||
const accepted = await withGatewayLiveProbeTimeout(
|
||||
params.client.request<{ runId?: unknown; status?: unknown }>("agent", {
|
||||
sessionKey: params.sessionKey,
|
||||
|
|
@ -643,6 +683,7 @@ async function requestGatewayAgentText(params: {
|
|||
sessionKey: params.sessionKey,
|
||||
baselineAssistantCount,
|
||||
context: `${params.context}: transcript-final`,
|
||||
modelKey: params.modelKey,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -650,6 +691,7 @@ type GatewayModelSuiteParams = {
|
|||
label: string;
|
||||
cfg: OpenClawConfig;
|
||||
candidates: Array<Model<Api>>;
|
||||
allowNotFoundSkip: boolean;
|
||||
extraToolProbes: boolean;
|
||||
extraImageProbes: boolean;
|
||||
thinkingLevel: string;
|
||||
|
|
@ -935,6 +977,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
|||
client,
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${randomUUID()}`,
|
||||
modelKey,
|
||||
message:
|
||||
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
|
|
@ -946,6 +989,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
|||
client,
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${randomUUID()}-retry`,
|
||||
modelKey,
|
||||
message:
|
||||
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
|
|
@ -969,6 +1013,10 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
|||
logProgress(`${progressLabel}: skip (google model not found)`);
|
||||
break;
|
||||
}
|
||||
if (params.allowNotFoundSkip && isModelNotFoundErrorMessage(text)) {
|
||||
logProgress(`${progressLabel}: skip (model not found)`);
|
||||
break;
|
||||
}
|
||||
assertNoReasoningTags({
|
||||
text,
|
||||
model: modelKey,
|
||||
|
|
@ -1001,6 +1049,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
|||
client,
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runIdTool}-tool-${toolReadAttempt + 1}`,
|
||||
modelKey,
|
||||
message: strictReply
|
||||
? "OpenClaw live tool probe (local, safe): " +
|
||||
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
|
||||
|
|
@ -1064,6 +1113,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
|||
client,
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runIdTool}-exec-read-${execReadAttempt + 1}`,
|
||||
modelKey,
|
||||
message: strictReply
|
||||
? "OpenClaw live tool probe (local, safe): " +
|
||||
"use the tool named `exec` (or `Exec`) to run this command: " +
|
||||
|
|
@ -1128,6 +1178,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
|||
client,
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runIdImage}-image`,
|
||||
modelKey,
|
||||
message:
|
||||
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
|
||||
"(1) the animal shown or written in the image, lowercase; " +
|
||||
|
|
@ -1185,6 +1236,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
|||
client,
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runId2}-1`,
|
||||
modelKey,
|
||||
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
context: `${progressLabel}: tool-only-regression-first`,
|
||||
|
|
@ -1200,6 +1252,7 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
|||
client,
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runId2}-2`,
|
||||
modelKey,
|
||||
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
context: `${progressLabel}: tool-only-regression-second`,
|
||||
|
|
@ -1268,11 +1321,27 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
|||
logProgress(`${progressLabel}: skip (google rate limit)`);
|
||||
break;
|
||||
}
|
||||
if (
|
||||
(model.provider === "minimax" ||
|
||||
model.provider === "opencode" ||
|
||||
model.provider === "opencode-go" ||
|
||||
model.provider === "zai") &&
|
||||
isRateLimitErrorMessage(message)
|
||||
) {
|
||||
skippedCount += 1;
|
||||
logProgress(`${progressLabel}: skip (rate limit)`);
|
||||
break;
|
||||
}
|
||||
if (isProviderUnavailableErrorMessage(message)) {
|
||||
skippedCount += 1;
|
||||
logProgress(`${progressLabel}: skip (provider unavailable)`);
|
||||
break;
|
||||
}
|
||||
if (params.allowNotFoundSkip && isModelNotFoundErrorMessage(message)) {
|
||||
skippedCount += 1;
|
||||
logProgress(`${progressLabel}: skip (model not found)`);
|
||||
break;
|
||||
}
|
||||
if (
|
||||
model.provider === "anthropic" &&
|
||||
isGatewayLiveProbeTimeout(message) &&
|
||||
|
|
@ -1448,6 +1517,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
|||
label: "all-models",
|
||||
cfg,
|
||||
candidates: selectedCandidates,
|
||||
allowNotFoundSkip: useModern,
|
||||
extraToolProbes: true,
|
||||
extraImageProbes: true,
|
||||
thinkingLevel: THINKING_LEVEL,
|
||||
|
|
@ -1469,6 +1539,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
|||
label: "minimax-anthropic",
|
||||
cfg,
|
||||
candidates: minimaxCandidates,
|
||||
allowNotFoundSkip: useModern,
|
||||
extraToolProbes: true,
|
||||
extraImageProbes: true,
|
||||
thinkingLevel: THINKING_LEVEL,
|
||||
|
|
@ -1589,6 +1660,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
|||
client,
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${randomUUID()}-tool`,
|
||||
modelKey: "anthropic/claude-opus-4-5",
|
||||
message:
|
||||
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
|
||||
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
|
||||
|
|
@ -1617,6 +1689,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
|||
client,
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${randomUUID()}-followup`,
|
||||
modelKey: "zai/glm-4.7",
|
||||
message:
|
||||
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
|
||||
`Reply with exactly: ${nonceA} ${nonceB}.`,
|
||||
|
|
|
|||
|
|
@ -556,6 +556,39 @@ describe("readSessionMessages", () => {
|
|||
expect((out[0] as { __openclaw?: { seq?: number } }).__openclaw?.seq).toBe(1);
|
||||
}
|
||||
});
|
||||
|
||||
// Guards that readSessionMessages hands back assistant transcript text as written
// on disk: <think>...</think> markup in the top-level `text` and in `content`
// parts is expected to survive the read unchanged.
test("preserves raw assistant transcript content on disk reads", () => {
|
||||
const sessionId = "assistant-scaffolding";
|
||||
const transcriptPath = path.join(tmpDir, `${sessionId}.jsonl`);
|
||||
// Write a two-line JSONL transcript: a session header record followed by one
// assistant message record whose text fields all contain <think> markup.
fs.writeFileSync(
|
||||
transcriptPath,
|
||||
[
|
||||
JSON.stringify({ type: "session", version: 1, id: sessionId }),
|
||||
JSON.stringify({
|
||||
message: {
|
||||
role: "assistant",
|
||||
text: "<think>hidden</think>Visible top-level",
|
||||
content: [
|
||||
{ type: "text", text: "<think>secret</think>Visible content" },
|
||||
{ type: "tool_result", text: "<think>keep?</think>Visible tool text" },
|
||||
],
|
||||
},
|
||||
}),
|
||||
].join("\n"),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
const out = readSessionMessages(sessionId, storePath);
|
||||
// Only the message record is expected back; the session header is not counted.
expect(out).toHaveLength(1);
|
||||
// Every text field must match the fixture exactly, <think> markup included.
expect(out[0]).toMatchObject({
|
||||
role: "assistant",
|
||||
text: "<think>hidden</think>Visible top-level",
|
||||
content: [
|
||||
{ type: "text", text: "<think>secret</think>Visible content" },
|
||||
{ type: "tool_result", text: "<think>keep?</think>Visible tool text" },
|
||||
],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("readSessionPreviewItemsFromTranscript", () => {
|
||||
|
|
|
|||
Loading…
Reference in New Issue