From 979409eab5c05e83aaba03fd2c1d6ed4c1f45244 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 6 Apr 2026 02:38:19 +0100 Subject: [PATCH] fix(qa): harden new scenario suite --- extensions/qa-lab/src/gateway-child.ts | 75 +- .../qa-lab/src/mock-openai-server.test.ts | 200 ++++ extensions/qa-lab/src/mock-openai-server.ts | 168 +++- extensions/qa-lab/src/qa-gateway-config.ts | 31 + extensions/qa-lab/src/suite.ts | 862 +++++++++++++++++- qa/new-scenarios-2026-04.md | 150 +++ qa/seed-scenarios.json | 136 +++ 7 files changed, 1596 insertions(+), 26 deletions(-) create mode 100644 qa/new-scenarios-2026-04.md diff --git a/extensions/qa-lab/src/gateway-child.ts b/extensions/qa-lab/src/gateway-child.ts index 6789cf7ae17..9685a4b44b8 100644 --- a/extensions/qa-lab/src/gateway-child.ts +++ b/extensions/qa-lab/src/gateway-child.ts @@ -24,6 +24,53 @@ async function getFreePort() { }); } +function buildQaRuntimeEnv(params: { + configPath: string; + gatewayToken: string; + homeDir: string; + stateDir: string; + xdgConfigHome: string; + xdgDataHome: string; + xdgCacheHome: string; + providerMode?: "mock-openai" | "live-openai"; +}) { + const env: NodeJS.ProcessEnv = { + ...process.env, + HOME: params.homeDir, + OPENCLAW_HOME: params.homeDir, + OPENCLAW_CONFIG_PATH: params.configPath, + OPENCLAW_STATE_DIR: params.stateDir, + OPENCLAW_OAUTH_DIR: path.join(params.stateDir, "credentials"), + OPENCLAW_GATEWAY_TOKEN: params.gatewayToken, + OPENCLAW_SKIP_BROWSER_CONTROL_SERVER: "1", + OPENCLAW_SKIP_GMAIL_WATCHER: "1", + OPENCLAW_SKIP_CANVAS_HOST: "1", + OPENCLAW_NO_RESPAWN: "1", + OPENCLAW_TEST_FAST: "1", + XDG_CONFIG_HOME: params.xdgConfigHome, + XDG_DATA_HOME: params.xdgDataHome, + XDG_CACHE_HOME: params.xdgCacheHome, + }; + if (params.providerMode === "mock-openai") { + for (const key of [ + "OPENAI_API_KEY", + "OPENAI_BASE_URL", + "GEMINI_API_KEY", + "GOOGLE_API_KEY", + "VOYAGE_API_KEY", + "MISTRAL_API_KEY", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", + "AWS_REGION", + "AWS_BEARER_TOKEN_BEDROCK", + ]) { + delete env[key]; + } + } + return env; +} + async function waitForGatewayReady(baseUrl: string, logs: () => string, timeoutMs = 30_000) { const startedAt = Date.now(); while (Date.now() - startedAt < timeoutMs) { @@ -116,23 +163,16 @@ export async function startQaGatewayChild(params: { const stdout: Buffer[] = []; const stderr: Buffer[] = []; - const env = { - ...process.env, - HOME: homeDir, - OPENCLAW_HOME: homeDir, - OPENCLAW_CONFIG_PATH: configPath, - OPENCLAW_STATE_DIR: stateDir, - OPENCLAW_OAUTH_DIR: path.join(stateDir, "credentials"), - OPENCLAW_GATEWAY_TOKEN: gatewayToken, - OPENCLAW_SKIP_BROWSER_CONTROL_SERVER: "1", - OPENCLAW_SKIP_GMAIL_WATCHER: "1", - OPENCLAW_SKIP_CANVAS_HOST: "1", - OPENCLAW_NO_RESPAWN: "1", - OPENCLAW_TEST_FAST: "1", - XDG_CONFIG_HOME: xdgConfigHome, - XDG_DATA_HOME: xdgDataHome, - XDG_CACHE_HOME: xdgCacheHome, - }; + const env = buildQaRuntimeEnv({ + configPath, + gatewayToken, + homeDir, + stateDir, + xdgConfigHome, + xdgDataHome, + xdgCacheHome, + providerMode: params.providerMode, + }); const child = spawn( process.execPath, @@ -176,6 +216,7 @@ export async function startQaGatewayChild(params: { workspaceDir, tempRoot, configPath, + runtimeEnv: env, logs, async call( method: string, diff --git a/extensions/qa-lab/src/mock-openai-server.test.ts b/extensions/qa-lab/src/mock-openai-server.test.ts index af527adaa6e..f4447d19511 100644 --- a/extensions/qa-lab/src/mock-openai-server.test.ts +++ b/extensions/qa-lab/src/mock-openai-server.test.ts @@ -82,6 +82,8 @@ describe("qa mock openai server", () => { expect(debugResponse.status).toBe(200); expect(await debugResponse.json()).toMatchObject({ prompt: 'Please inspect "message_id" metadata first, then read `./QA_KICKOFF_TASK.md`.', + allInputText: 'Please inspect "message_id" metadata first, then read `./QA_KICKOFF_TASK.md`.', + plannedToolName: "read", }); }); @@ -200,4 +202,202 @@ describe("qa mock openai server", () => { expect(body).toContain('\\"label\\":\\"qa-sidecar\\"'); expect(body).toContain('\\"thread\\":false'); }); + + it("plans memory tools and serves mock image generations", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const memorySearch = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first.", + }, + ], + }, + ], + }), + }); + expect(memorySearch.status).toBe(200); + expect(await memorySearch.text()).toContain('"name":"memory_search"'); + + const image = await fetch(`${server.baseUrl}/v1/images/generations`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + model: "gpt-image-1", + prompt: "Draw a QA lighthouse", + n: 1, + size: "1024x1024", + }), + }); + expect(image.status).toBe(200); + expect(await image.json()).toMatchObject({ + data: [{ b64_json: expect.any(String) }], + }); + }); + + it("returns exact markers for visible and hot-installed skills", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const visible = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: false, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Visible skill marker: give me the visible skill marker exactly.", + }, + ], + }, + ], + }), + }); + expect(visible.status).toBe(200); + expect(await visible.json()).toMatchObject({ + output: [ + { + content: [{ text: "VISIBLE-SKILL-OK" }], + }, + ], + }); + + const hot = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: false, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: "Hot install marker: give me the hot install marker exactly.", + }, + ], + }, + ], + }), + }); + expect(hot.status).toBe(200); + expect(await hot.json()).toMatchObject({ + output: [ + { + content: [{ text: "HOT-INSTALL-OK" }], + }, + ], + }); + }); + + it("ignores stale tool output from prior turns when planning the current turn", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: true, + input: [ + { + role: "user", + content: [{ type: "input_text", text: "Read QA_KICKOFF_TASK.md first." }], + }, + { + type: "function_call_output", + output: "QA mission: read source and docs first.", + }, + { + role: "user", + content: [ + { + type: "input_text", + text: "Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence.", + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + expect(await response.text()).toContain('"name":"read"'); + }); + + it("returns NO_REPLY for unmentioned group chatter", async () => { + const server = await startQaMockOpenAiServer({ + host: "127.0.0.1", + port: 0, + }); + cleanups.push(async () => { + await server.stop(); + }); + + const response = await fetch(`${server.baseUrl}/v1/responses`, { + method: "POST", + headers: { + "content-type": "application/json", + }, + body: JSON.stringify({ + stream: false, + input: [ + { + role: "user", + content: [ + { + type: "input_text", + text: 'Conversation info (untrusted metadata): {"is_group_chat": true}\n\nhello team, no bot ping here', + }, + ], + }, + ], + }), + }); + expect(response.status).toBe(200); + expect(await response.json()).toMatchObject({ + output: [ + { + content: [{ text: "NO_REPLY" }], + }, + ], + }); + }); }); diff --git a/extensions/qa-lab/src/mock-openai-server.ts b/extensions/qa-lab/src/mock-openai-server.ts index 3c6591b78d6..b7f7557cddc 100644 --- a/extensions/qa-lab/src/mock-openai-server.ts +++ b/extensions/qa-lab/src/mock-openai-server.ts @@ -24,10 +24,15 @@ type MockOpenAiRequestSnapshot = { raw: string; body: Record; prompt: string; + allInputText: string; toolOutput: string; model: string; + plannedToolName?: string; }; +const TINY_PNG_BASE64 = + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII="; + function readBody(req: IncomingMessage): Promise { return new Promise((resolve, reject) => { const chunks: Buffer[] = []; @@ -82,8 +87,19 @@ function extractLastUserText(input: ResponsesInputItem[]) { return ""; } -function extractToolOutput(input: ResponsesInputItem[]) { +function findLastUserIndex(input: ResponsesInputItem[]) { for (let index = input.length - 1; index >= 0; index -= 1) { + const item = input[index]; + if (item.role === "user" && Array.isArray(item.content)) { + return index; + } + } + return -1; +} + +function extractToolOutput(input: ResponsesInputItem[]) { + const lastUserIndex = findLastUserIndex(input); + for (let index = input.length - 1; index > lastUserIndex; index -= 1) { const item = input[index]; if (item.type === "function_call_output" && typeof item.output === "string" && item.output) { return item.output; @@ -116,6 +132,44 @@ function extractAllUserTexts(input: ResponsesInputItem[]) { return texts; } +function extractAllInputTexts(input: ResponsesInputItem[]) { + const texts: string[] = []; + for (const item of input) { + if (typeof item.output === "string" && item.output.trim()) { + texts.push(item.output.trim()); + } + if (!Array.isArray(item.content)) { + continue; + } + const text = item.content + .filter( + (entry): entry is { type: "input_text"; text: string } => + !!entry && + typeof entry === "object" && + (entry as { type?: unknown }).type === "input_text" && + typeof (entry as { text?: unknown }).text === "string", + ) + .map((entry) => entry.text) + .join("\n") + .trim(); + if (text) { + texts.push(text); + } + } + return texts.join("\n"); +} + +function parseToolOutputJson(toolOutput: string): Record | null { + if (!toolOutput.trim()) { + return null; + } + try { + return JSON.parse(toolOutput) as Record; + } catch { + return null; + } +} + function normalizePromptPathCandidate(candidate: string) { const trimmed = candidate.trim().replace(/^`+|`+$/g, ""); if (!trimmed) { @@ -221,12 +275,26 @@ function extractRememberedFact(userTexts: string[]) { return null; } +function extractOrbitCode(text: string) { + return /\b(?:ORBIT-9|orbit-9)\b/.exec(text)?.[0]?.toUpperCase() ?? null; +} + function buildAssistantText(input: ResponsesInputItem[], body: Record) { const prompt = extractLastUserText(input); const toolOutput = extractToolOutput(input); + const toolJson = parseToolOutputJson(toolOutput); const userTexts = extractAllUserTexts(input); + const allInputText = extractAllInputTexts(input); const rememberedFact = extractRememberedFact(userTexts); const model = typeof body.model === "string" ? body.model : ""; + const memorySnippet = + typeof toolJson?.text === "string" + ? toolJson.text + : Array.isArray(toolJson?.results) + ? JSON.stringify(toolJson.results) + : toolOutput; + const orbitCode = extractOrbitCode(memorySnippet); + const mediaPath = /MEDIA:([^\n]+)/.exec(toolOutput)?.[1]?.trim(); if (/what was the qa canary code/i.test(prompt) && rememberedFact) { return `Protocol note: the QA canary code was ${rememberedFact}.`; @@ -234,9 +302,27 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record