fix(qa): harden new scenario suite

This commit is contained in:
Peter Steinberger 2026-04-06 02:38:19 +01:00
parent 80c5df6bdc
commit 979409eab5
No known key found for this signature in database
7 changed files with 1596 additions and 26 deletions

View File

@ -24,6 +24,53 @@ async function getFreePort() {
});
}
/**
 * Assemble the environment for a spawned QA gateway child process.
 *
 * Starts from the current process env, overlays the OpenClaw/XDG paths and
 * test-mode flags for the sandboxed run, and — in mock-openai mode — strips
 * any real provider credentials so the mocked run can never reach live APIs.
 */
function buildQaRuntimeEnv(params: {
  configPath: string;
  gatewayToken: string;
  homeDir: string;
  stateDir: string;
  xdgConfigHome: string;
  xdgDataHome: string;
  xdgCacheHome: string;
  providerMode?: "mock-openai" | "live-openai";
}) {
  // Variables that could leak live provider credentials into a mocked run.
  const providerCredentialKeys = [
    "OPENAI_API_KEY",
    "OPENAI_BASE_URL",
    "GEMINI_API_KEY",
    "GOOGLE_API_KEY",
    "VOYAGE_API_KEY",
    "MISTRAL_API_KEY",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "AWS_SESSION_TOKEN",
    "AWS_REGION",
    "AWS_BEARER_TOKEN_BEDROCK",
  ];
  const env: NodeJS.ProcessEnv = {
    ...process.env,
    HOME: params.homeDir,
    OPENCLAW_HOME: params.homeDir,
    OPENCLAW_CONFIG_PATH: params.configPath,
    OPENCLAW_STATE_DIR: params.stateDir,
    OPENCLAW_OAUTH_DIR: path.join(params.stateDir, "credentials"),
    OPENCLAW_GATEWAY_TOKEN: params.gatewayToken,
    OPENCLAW_SKIP_BROWSER_CONTROL_SERVER: "1",
    OPENCLAW_SKIP_GMAIL_WATCHER: "1",
    OPENCLAW_SKIP_CANVAS_HOST: "1",
    OPENCLAW_NO_RESPAWN: "1",
    OPENCLAW_TEST_FAST: "1",
    XDG_CONFIG_HOME: params.xdgConfigHome,
    XDG_DATA_HOME: params.xdgDataHome,
    XDG_CACHE_HOME: params.xdgCacheHome,
  };
  if (params.providerMode === "mock-openai") {
    for (const key of providerCredentialKeys) {
      delete env[key];
    }
  }
  return env;
}
async function waitForGatewayReady(baseUrl: string, logs: () => string, timeoutMs = 30_000) {
const startedAt = Date.now();
while (Date.now() - startedAt < timeoutMs) {
@ -116,23 +163,16 @@ export async function startQaGatewayChild(params: {
const stdout: Buffer[] = [];
const stderr: Buffer[] = [];
const env = {
...process.env,
HOME: homeDir,
OPENCLAW_HOME: homeDir,
OPENCLAW_CONFIG_PATH: configPath,
OPENCLAW_STATE_DIR: stateDir,
OPENCLAW_OAUTH_DIR: path.join(stateDir, "credentials"),
OPENCLAW_GATEWAY_TOKEN: gatewayToken,
OPENCLAW_SKIP_BROWSER_CONTROL_SERVER: "1",
OPENCLAW_SKIP_GMAIL_WATCHER: "1",
OPENCLAW_SKIP_CANVAS_HOST: "1",
OPENCLAW_NO_RESPAWN: "1",
OPENCLAW_TEST_FAST: "1",
XDG_CONFIG_HOME: xdgConfigHome,
XDG_DATA_HOME: xdgDataHome,
XDG_CACHE_HOME: xdgCacheHome,
};
const env = buildQaRuntimeEnv({
configPath,
gatewayToken,
homeDir,
stateDir,
xdgConfigHome,
xdgDataHome,
xdgCacheHome,
providerMode: params.providerMode,
});
const child = spawn(
process.execPath,
@ -176,6 +216,7 @@ export async function startQaGatewayChild(params: {
workspaceDir,
tempRoot,
configPath,
runtimeEnv: env,
logs,
async call(
method: string,

View File

@ -82,6 +82,8 @@ describe("qa mock openai server", () => {
expect(debugResponse.status).toBe(200);
expect(await debugResponse.json()).toMatchObject({
prompt: 'Please inspect "message_id" metadata first, then read `./QA_KICKOFF_TASK.md`.',
allInputText: 'Please inspect "message_id" metadata first, then read `./QA_KICKOFF_TASK.md`.',
plannedToolName: "read",
});
});
@ -200,4 +202,202 @@ describe("qa mock openai server", () => {
expect(body).toContain('\\"label\\":\\"qa-sidecar\\"');
expect(body).toContain('\\"thread\\":false');
});
// Exercises two mock endpoints in one test: a "Memory tools check" prompt to
// /v1/responses must yield a planned memory_search tool call in the stream,
// and /v1/images/generations must answer with a base64 image payload.
it("plans memory tools and serves mock image generations", async () => {
  const server = await startQaMockOpenAiServer({
    host: "127.0.0.1",
    port: 0, // port 0 = OS-assigned ephemeral port; server.baseUrl reflects it
  });
  cleanups.push(async () => {
    await server.stop();
  });
  const memorySearch = await fetch(`${server.baseUrl}/v1/responses`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
    },
    body: JSON.stringify({
      stream: true,
      input: [
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: "Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first.",
            },
          ],
        },
      ],
    }),
  });
  expect(memorySearch.status).toBe(200);
  // The streamed body must contain a function_call planning memory_search.
  expect(await memorySearch.text()).toContain('"name":"memory_search"');
  const image = await fetch(`${server.baseUrl}/v1/images/generations`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
    },
    body: JSON.stringify({
      model: "gpt-image-1",
      prompt: "Draw a QA lighthouse",
      n: 1,
      size: "1024x1024",
    }),
  });
  expect(image.status).toBe(200);
  // Only asserts the shape (some base64 string), not the exact image bytes.
  expect(await image.json()).toMatchObject({
    data: [{ b64_json: expect.any(String) }],
  });
});
// Both skill prompts must be answered with their exact marker strings
// (non-streaming), so the QA suite can assert skill visibility and
// hot-install behavior end to end against deterministic replies.
it("returns exact markers for visible and hot-installed skills", async () => {
  const server = await startQaMockOpenAiServer({
    host: "127.0.0.1",
    port: 0,
  });
  cleanups.push(async () => {
    await server.stop();
  });
  const visible = await fetch(`${server.baseUrl}/v1/responses`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
    },
    body: JSON.stringify({
      stream: false,
      input: [
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: "Visible skill marker: give me the visible skill marker exactly.",
            },
          ],
        },
      ],
    }),
  });
  expect(visible.status).toBe(200);
  expect(await visible.json()).toMatchObject({
    output: [
      {
        content: [{ text: "VISIBLE-SKILL-OK" }],
      },
    ],
  });
  const hot = await fetch(`${server.baseUrl}/v1/responses`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
    },
    body: JSON.stringify({
      stream: false,
      input: [
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: "Hot install marker: give me the hot install marker exactly.",
            },
          ],
        },
      ],
    }),
  });
  expect(hot.status).toBe(200);
  expect(await hot.json()).toMatchObject({
    output: [
      {
        content: [{ text: "HOT-INSTALL-OK" }],
      },
    ],
  });
});
// A function_call_output from an earlier turn sits BETWEEN two user messages;
// the mock must still plan a fresh `read` tool call for the latest user turn
// instead of treating the stale output as satisfying the new request.
it("ignores stale tool output from prior turns when planning the current turn", async () => {
  const server = await startQaMockOpenAiServer({
    host: "127.0.0.1",
    port: 0,
  });
  cleanups.push(async () => {
    await server.stop();
  });
  const response = await fetch(`${server.baseUrl}/v1/responses`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
    },
    body: JSON.stringify({
      stream: true,
      input: [
        {
          role: "user",
          content: [{ type: "input_text", text: "Read QA_KICKOFF_TASK.md first." }],
        },
        {
          // Stale tool result from the first turn — precedes the final user message.
          type: "function_call_output",
          output: "QA mission: read source and docs first.",
        },
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: "Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence.",
            },
          ],
        },
      ],
    }),
  });
  expect(response.status).toBe(200);
  // Stream must contain a planned `read` call, proving the stale output was ignored.
  expect(await response.text()).toContain('"name":"read"');
});
// Group-chat chatter that does not mention the bot: the prompt carries
// `"is_group_chat": true` metadata plus the "no bot ping here" phrase, and the
// mock must answer the literal sentinel NO_REPLY (meaning: stay silent).
it("returns NO_REPLY for unmentioned group chatter", async () => {
  const server = await startQaMockOpenAiServer({
    host: "127.0.0.1",
    port: 0,
  });
  cleanups.push(async () => {
    await server.stop();
  });
  const response = await fetch(`${server.baseUrl}/v1/responses`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
    },
    body: JSON.stringify({
      stream: false,
      input: [
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: 'Conversation info (untrusted metadata): {"is_group_chat": true}\n\nhello team, no bot ping here',
            },
          ],
        },
      ],
    }),
  });
  expect(response.status).toBe(200);
  expect(await response.json()).toMatchObject({
    output: [
      {
        content: [{ text: "NO_REPLY" }],
      },
    ],
  });
});
});

View File

@ -24,10 +24,15 @@ type MockOpenAiRequestSnapshot = {
raw: string;
body: Record<string, unknown>;
prompt: string;
allInputText: string;
toolOutput: string;
model: string;
plannedToolName?: string;
};
// Minimal valid 1x1 PNG, base64-encoded; served as the b64_json payload of the
// mock /v1/images/generations endpoint.
const TINY_PNG_BASE64 =
  "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";
function readBody(req: IncomingMessage): Promise<string> {
return new Promise((resolve, reject) => {
const chunks: Buffer[] = [];
@ -82,8 +87,19 @@ function extractLastUserText(input: ResponsesInputItem[]) {
return "";
}
function extractToolOutput(input: ResponsesInputItem[]) {
/**
 * Index of the most recent user message whose content is an array
 * (i.e. a structured Responses message); -1 when no such item exists.
 */
function findLastUserIndex(input: ResponsesInputItem[]) {
  let lastIndex = -1;
  input.forEach((item, index) => {
    if (item.role === "user" && Array.isArray(item.content)) {
      lastIndex = index;
    }
  });
  return lastIndex;
}
function extractToolOutput(input: ResponsesInputItem[]) {
const lastUserIndex = findLastUserIndex(input);
for (let index = input.length - 1; index > lastUserIndex; index -= 1) {
const item = input[index];
if (item.type === "function_call_output" && typeof item.output === "string" && item.output) {
return item.output;
@ -116,6 +132,44 @@ function extractAllUserTexts(input: ResponsesInputItem[]) {
return texts;
}
/**
 * Flatten every textual fragment of a Responses `input` array into a single
 * newline-joined string: trimmed tool outputs (`item.output`) plus all
 * `input_text` entries found in message content arrays, in input order.
 */
function extractAllInputTexts(input: ResponsesInputItem[]) {
  const collected: string[] = [];
  for (const item of input) {
    // Tool results carry their text on `output` rather than `content`.
    if (typeof item.output === "string" && item.output.trim()) {
      collected.push(item.output.trim());
    }
    if (!Array.isArray(item.content)) {
      continue;
    }
    const fragments: string[] = [];
    for (const entry of item.content) {
      if (
        !!entry &&
        typeof entry === "object" &&
        (entry as { type?: unknown }).type === "input_text" &&
        typeof (entry as { text?: unknown }).text === "string"
      ) {
        fragments.push((entry as { text: string }).text);
      }
    }
    const joined = fragments.join("\n").trim();
    if (joined) {
      collected.push(joined);
    }
  }
  return collected.join("\n");
}
/**
 * Parse a tool output string as JSON, returning it only when it is a plain
 * JSON object. Empty/invalid input — and JSON that parses to an array,
 * primitive, or null — yields null so callers fall back to the raw text.
 */
function parseToolOutputJson(toolOutput: string): Record<string, unknown> | null {
  if (!toolOutput.trim()) {
    return null;
  }
  try {
    const parsed: unknown = JSON.parse(toolOutput);
    // JSON.parse can produce arrays, numbers, strings, booleans, or null;
    // only a plain object actually satisfies the declared return type.
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      return parsed as Record<string, unknown>;
    }
    return null;
  } catch {
    return null;
  }
}
function normalizePromptPathCandidate(candidate: string) {
const trimmed = candidate.trim().replace(/^`+|`+$/g, "");
if (!trimmed) {
@ -221,12 +275,26 @@ function extractRememberedFact(userTexts: string[]) {
return null;
}
/** Find the QA canary codename "ORBIT-9" (upper or lower case, as a whole word) in a text; null when absent. */
function extractOrbitCode(text: string) {
  const match = text.match(/\b(?:ORBIT-9|orbit-9)\b/);
  return match ? match[0].toUpperCase() : null;
}
function buildAssistantText(input: ResponsesInputItem[], body: Record<string, unknown>) {
const prompt = extractLastUserText(input);
const toolOutput = extractToolOutput(input);
const toolJson = parseToolOutputJson(toolOutput);
const userTexts = extractAllUserTexts(input);
const allInputText = extractAllInputTexts(input);
const rememberedFact = extractRememberedFact(userTexts);
const model = typeof body.model === "string" ? body.model : "";
const memorySnippet =
typeof toolJson?.text === "string"
? toolJson.text
: Array.isArray(toolJson?.results)
? JSON.stringify(toolJson.results)
: toolOutput;
const orbitCode = extractOrbitCode(memorySnippet);
const mediaPath = /MEDIA:([^\n]+)/.exec(toolOutput)?.[1]?.trim();
if (/what was the qa canary code/i.test(prompt) && rememberedFact) {
return `Protocol note: the QA canary code was ${rememberedFact}.`;
@ -234,9 +302,27 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record<string, un
if (/remember this fact/i.test(prompt) && rememberedFact) {
return `Protocol note: acknowledged. I will remember ${rememberedFact}.`;
}
if (/memory unavailable check/i.test(prompt)) {
return "Protocol note: I checked the available runtime context but could not confirm the hidden memory-only fact, so I will not guess.";
}
if (/visible skill marker/i.test(prompt)) {
return "VISIBLE-SKILL-OK";
}
if (/hot install marker/i.test(prompt)) {
return "HOT-INSTALL-OK";
}
if (/memory tools check/i.test(prompt) && orbitCode) {
return `Protocol note: I checked memory and the project codename is ${orbitCode}.`;
}
if (/switch(?:ing)? models?/i.test(prompt)) {
return `Protocol note: model switch acknowledged. Continuing on ${model || "the requested model"}.`;
}
if (/tool continuity check/i.test(prompt) && toolOutput) {
return `Protocol note: model switch acknowledged. Tool continuity held on ${model || "the requested model"}.`;
}
if (/image generation check/i.test(prompt) && mediaPath) {
return `Protocol note: generated the QA lighthouse image successfully.\nMEDIA:${mediaPath}`;
}
if (toolOutput && /delegate|subagent/i.test(prompt)) {
return `Protocol note: delegated result acknowledged. The bounded subagent task returned and is folded back into the main thread.`;
}
@ -264,6 +350,19 @@ function buildToolCallEvents(prompt: string): StreamEvent[] {
return buildToolCallEventsWithArgs("read", { path: targetPath });
}
/**
 * Pull the first planned tool name out of a stream-event list: the name of
 * the earliest `response.output_item.done` event whose item is a
 * `function_call` with a string name; undefined when none exists.
 */
function extractPlannedToolName(events: StreamEvent[]) {
  const planned = events.find((event) => {
    if (event.type !== "response.output_item.done") {
      return false;
    }
    const item = event.item as { type?: unknown; name?: unknown };
    return item.type === "function_call" && typeof item.name === "string";
  });
  if (!planned) {
    return undefined;
  }
  return (planned.item as { name: string }).name;
}
function buildAssistantEvents(text: string): StreamEvent[] {
const outputItem = {
type: "message",
@ -303,6 +402,10 @@ function buildResponsesPayload(body: Record<string, unknown>) {
const input = Array.isArray(body.input) ? (body.input as ResponsesInputItem[]) : [];
const prompt = extractLastUserText(input);
const toolOutput = extractToolOutput(input);
const toolJson = parseToolOutputJson(toolOutput);
const allInputText = extractAllInputTexts(input);
const isGroupChat = allInputText.includes('"is_group_chat": true');
const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
if (/lobster invaders/i.test(prompt)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
@ -318,6 +421,44 @@ function buildResponsesPayload(body: Record<string, unknown>) {
});
}
}
if (/memory tools check/i.test(prompt)) {
if (!toolOutput) {
return buildToolCallEventsWithArgs("memory_search", {
query: "project codename ORBIT-9",
maxResults: 3,
});
}
const results = Array.isArray(toolJson?.results)
? (toolJson.results as Array<Record<string, unknown>>)
: [];
const first = results[0];
if (
typeof first?.path === "string" &&
(typeof first.startLine === "number" || typeof first.endLine === "number")
) {
const from =
typeof first.startLine === "number"
? Math.max(1, first.startLine)
: typeof first.endLine === "number"
? Math.max(1, first.endLine)
: 1;
return buildToolCallEventsWithArgs("memory_get", {
path: first.path,
from,
lines: 4,
});
}
}
if (/image generation check/i.test(prompt) && !toolOutput) {
return buildToolCallEventsWithArgs("image_generate", {
prompt: "A QA lighthouse on a dark sea with a tiny protocol droid silhouette.",
filename: "qa-lighthouse.png",
size: "1024x1024",
});
}
if (/tool continuity check/i.test(prompt) && !toolOutput) {
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
}
if (/delegate|subagent/i.test(prompt) && !toolOutput) {
return buildToolCallEventsWithArgs("sessions_spawn", {
task: "Inspect the QA workspace and return one concise protocol note.",
@ -334,6 +475,15 @@ function buildResponsesPayload(body: Record<string, unknown>) {
if (!toolOutput && /\b(read|inspect|repo|docs|scenario|kickoff)\b/i.test(prompt)) {
return buildToolCallEvents(prompt);
}
if (/visible skill marker/i.test(prompt) && !toolOutput) {
return buildAssistantEvents("VISIBLE-SKILL-OK");
}
if (/hot install marker/i.test(prompt) && !toolOutput) {
return buildAssistantEvents("HOT-INSTALL-OK");
}
if (isGroupChat && isBaselineUnmentionedChannelChatter && !toolOutput) {
return buildAssistantEvents("NO_REPLY");
}
return buildAssistantEvents(buildAssistantText(input, body));
}
@ -352,6 +502,7 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n
data: [
{ id: "gpt-5.4", object: "model" },
{ id: "gpt-5.4-alt", object: "model" },
{ id: "gpt-image-1", object: "model" },
],
});
return;
@ -364,22 +515,35 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n
writeJson(res, 200, requests);
return;
}
if (req.method === "POST" && url.pathname === "/v1/images/generations") {
writeJson(res, 200, {
data: [
{
b64_json: TINY_PNG_BASE64,
revised_prompt: "A QA lighthouse with protocol droid silhouette.",
},
],
});
return;
}
if (req.method === "POST" && url.pathname === "/v1/responses") {
const raw = await readBody(req);
const body = raw ? (JSON.parse(raw) as Record<string, unknown>) : {};
const input = Array.isArray(body.input) ? (body.input as ResponsesInputItem[]) : [];
const events = buildResponsesPayload(body);
lastRequest = {
raw,
body,
prompt: extractLastUserText(input),
allInputText: extractAllInputTexts(input),
toolOutput: extractToolOutput(input),
model: typeof body.model === "string" ? body.model : "",
plannedToolName: extractPlannedToolName(events),
};
requests.push(lastRequest);
if (requests.length > 50) {
requests.splice(0, requests.length - 50);
}
const events = buildResponsesPayload(body);
if (body.stream === false) {
const completion = events.at(-1);
if (!completion || completion.type !== "response.completed") {

View File

@ -74,6 +74,21 @@ export function buildQaGatewayConfig(params: {
contextWindow: 128_000,
maxTokens: 4096,
},
{
id: "gpt-image-1",
name: "gpt-image-1",
api: "openai-responses",
reasoning: false,
input: ["text"],
cost: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
},
contextWindow: 128_000,
maxTokens: 4096,
},
],
};
const providerMode = params.providerMode ?? "mock-openai";
@ -87,6 +102,8 @@ export function buildQaGatewayConfig(params: {
const alternateModel =
params.alternateModel ??
(providerMode === "live-openai" ? "openai/gpt-5.4" : "mock-openai/gpt-5.4-alt");
const imageGenerationModelRef =
providerMode === "live-openai" ? "openai/gpt-image-1" : "mock-openai/gpt-image-1";
const liveModelParams =
providerMode === "live-openai"
? {
@ -133,6 +150,17 @@ export function buildQaGatewayConfig(params: {
model: {
primary: primaryModel,
},
imageGenerationModel: {
primary: imageGenerationModelRef,
},
memorySearch: {
sync: {
watch: true,
watchDebounceMs: 25,
onSessionStart: true,
onSearch: true,
},
},
models: {
[primaryModel]: {
params: liveModelParams,
@ -165,6 +193,9 @@ export function buildQaGatewayConfig(params: {
},
],
},
memory: {
backend: "builtin",
},
...(providerMode === "mock-openai"
? {
models: {

View File

@ -1,7 +1,10 @@
import { spawn } from "node:child_process";
import { randomUUID } from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";
import { setTimeout as sleep } from "node:timers/promises";
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
import type { OpenClawConfig } from "openclaw/plugin-sdk/core";
import type { QaBusState } from "./bus-state.js";
import { extractQaToolPayload } from "./extract-tool-payload.js";
@ -35,6 +38,18 @@ type QaSuiteEnvironment = {
alternateModel: string;
};
// One entry from the gateway `skills.status` RPC. Every field is optional
// because the payload is treated as partial/untrusted by the suite.
type QaSkillStatusEntry = {
  name?: string;
  eligible?: boolean;
  disabled?: boolean;
  blockedByAllowlist?: boolean;
};
// Response of the gateway `config.get` RPC; `hash` is used as the
// optimistic-concurrency baseHash for config.patch / config.apply.
type QaConfigSnapshot = {
  hash?: string;
  config?: Record<string, unknown>;
};
function splitModelRef(ref: string) {
const slash = ref.indexOf("/");
if (slash <= 0 || slash === ref.length - 1) {
@ -138,7 +153,13 @@ async function runScenario(name: string, steps: QaSuiteStep[]): Promise<QaSuiteS
const stepResults: QaReportCheck[] = [];
for (const step of steps) {
try {
if (process.env.OPENCLAW_QA_DEBUG === "1") {
console.error(`[qa-suite] start scenario="${name}" step="${step.name}"`);
}
const details = await step.run();
if (process.env.OPENCLAW_QA_DEBUG === "1") {
console.error(`[qa-suite] pass scenario="${name}" step="${step.name}"`);
}
stepResults.push({
name: step.name,
status: "pass",
@ -146,6 +167,9 @@ async function runScenario(name: string, steps: QaSuiteStep[]): Promise<QaSuiteS
});
} catch (error) {
const details = error instanceof Error ? error.message : String(error);
if (process.env.OPENCLAW_QA_DEBUG === "1") {
console.error(`[qa-suite] fail scenario="${name}" step="${step.name}" details=${details}`);
}
stepResults.push({
name: step.name,
status: "fail",
@ -174,6 +198,264 @@ async function fetchJson<T>(url: string): Promise<T> {
return (await response.json()) as T;
}
/**
 * Poll the gateway /readyz endpoint until it responds OK or the timeout
 * lapses. Network errors are treated the same as not-ready and retried
 * every 250ms.
 */
async function waitForGatewayHealthy(env: QaSuiteEnvironment, timeoutMs = 45_000) {
  const probeReady = async () => {
    try {
      const response = await fetch(`${env.gateway.baseUrl}/readyz`);
      return response.ok ? true : undefined;
    } catch {
      return undefined;
    }
  };
  await waitForCondition(probeReady, timeoutMs, 250);
}
/**
 * Heuristic for errors caused by the gateway restarting mid-RPC (expected
 * right after config.patch/config.apply) rather than a genuine failure.
 */
function isGatewayRestartRace(error: unknown) {
  const text = error instanceof Error ? error.message : String(error);
  const restartMarkers = [
    "gateway closed (1012)",
    "gateway closed (1006",
    "abnormal closure",
    "service restart",
  ];
  return restartMarkers.some((marker) => text.includes(marker));
}
/**
 * Fetch the live gateway config plus its hash (the hash is required as
 * baseHash by config.patch/config.apply). Throws when the gateway returns
 * an incomplete snapshot.
 */
async function readConfigSnapshot(env: QaSuiteEnvironment) {
  const { hash, config } = (await env.gateway.call("config.get", {})) as QaConfigSnapshot;
  if (!hash || !config) {
    throw new Error("config.get returned no hash/config");
  }
  return { hash, config } satisfies { hash: string; config: Record<string, unknown> };
}
/**
 * Apply a JSON patch via gateway `config.patch`, tolerating the connection
 * drop that a config-triggered gateway restart causes.
 *
 * Reads the current snapshot first to supply `baseHash`. On a restart-race
 * error it waits for the gateway to become healthy again and reports
 * `{ ok: true, restarted: true }`; any other error is rethrown.
 */
async function patchConfig(params: {
  env: QaSuiteEnvironment;
  patch: Record<string, unknown>;
  sessionKey?: string;
  note?: string;
  restartDelayMs?: number;
}) {
  const { env, patch, sessionKey, note, restartDelayMs } = params;
  const snapshot = await readConfigSnapshot(env);
  const request: Record<string, unknown> = {
    raw: JSON.stringify(patch, null, 2),
    baseHash: snapshot.hash,
  };
  if (sessionKey) {
    request.sessionKey = sessionKey;
  }
  if (note) {
    request.note = note;
  }
  request.restartDelayMs = restartDelayMs ?? 1_000;
  try {
    return await env.gateway.call("config.patch", request, { timeoutMs: 45_000 });
  } catch (error) {
    // A dropped socket right after patching usually means the gateway is
    // restarting to pick up the new config, not that the patch failed.
    if (!isGatewayRestartRace(error)) {
      throw error;
    }
    await waitForGatewayHealthy(env);
    return { ok: true, restarted: true };
  }
}
/**
 * Replace the full gateway config via `config.apply`, tolerating the
 * connection drop that a config-triggered gateway restart causes.
 *
 * Reads the current snapshot first to supply `baseHash`. On a restart-race
 * error it waits for the gateway to become healthy again and reports
 * `{ ok: true, restarted: true }`; any other error is rethrown.
 */
async function applyConfig(params: {
  env: QaSuiteEnvironment;
  nextConfig: Record<string, unknown>;
  sessionKey?: string;
  note?: string;
  restartDelayMs?: number;
}) {
  const { env, nextConfig, sessionKey, note, restartDelayMs } = params;
  const snapshot = await readConfigSnapshot(env);
  const request: Record<string, unknown> = {
    raw: JSON.stringify(nextConfig, null, 2),
    baseHash: snapshot.hash,
  };
  if (sessionKey) {
    request.sessionKey = sessionKey;
  }
  if (note) {
    request.note = note;
  }
  request.restartDelayMs = restartDelayMs ?? 1_000;
  try {
    return await env.gateway.call("config.apply", request, { timeoutMs: 45_000 });
  } catch (error) {
    // A dropped socket right after applying usually means the gateway is
    // restarting to pick up the new config, not that the apply failed.
    if (!isGatewayRestartRace(error)) {
      throw error;
    }
    await waitForGatewayHealthy(env);
    return { ok: true, restarted: true };
  }
}
/**
 * Create a gateway session with the given label (and optional fixed key) and
 * return its trimmed key; throws when the gateway answers without one.
 */
async function createSession(env: QaSuiteEnvironment, label: string, key?: string) {
  const request: Record<string, unknown> = { label };
  if (key) {
    request.key = key;
  }
  const created = (await env.gateway.call("sessions.create", request)) as { key?: string };
  const sessionKey = created.key?.trim();
  if (!sessionKey) {
    throw new Error("sessions.create returned no key");
  }
  return sessionKey;
}
/**
 * Query `tools.effective` for a session and return the set of trimmed,
 * non-empty tool ids across all groups (Set membership dedupes repeats).
 */
async function readEffectiveTools(env: QaSuiteEnvironment, sessionKey: string) {
  const payload = (await env.gateway.call(
    "tools.effective",
    { sessionKey },
    { timeoutMs: liveTurnTimeoutMs(env, 90_000) },
  )) as {
    groups?: Array<{ tools?: Array<{ id?: string }> }>;
  };
  const toolIds = (payload.groups ?? [])
    .flatMap((group) => group.tools ?? [])
    .map((tool) => tool.id?.trim())
    .filter((id): id is string => !!id);
  return new Set(toolIds);
}
/**
 * Fetch the skill status list for an agent via `skills.status`
 * (defaults to the "qa" agent); an absent list becomes [].
 */
async function readSkillStatus(env: QaSuiteEnvironment, agentId = "qa") {
  const response = (await env.gateway.call(
    "skills.status",
    { agentId },
    { timeoutMs: liveTurnTimeoutMs(env, 45_000) },
  )) as {
    skills?: QaSkillStatusEntry[];
  };
  return response.skills ?? [];
}
/**
 * Run the OpenClaw CLI (dist/index.js) as a child process using the
 * gateway's runtime environment.
 *
 * Resolves with trimmed stdout, parsed as JSON when `opts.json` is set
 * (empty stdout yields {}). Rejects — after SIGKILL-ing the child — when the
 * run exceeds `opts.timeoutMs` (default 60s), and rejects with stderr
 * included when the CLI exits non-zero.
 */
async function runQaCli(
  env: QaSuiteEnvironment,
  args: string[],
  opts?: { timeoutMs?: number; json?: boolean },
) {
  const outChunks: Buffer[] = [];
  const errChunks: Buffer[] = [];
  await new Promise<void>((resolve, reject) => {
    const child = spawn(process.execPath, ["dist/index.js", ...args], {
      cwd: process.cwd(),
      env: env.gateway.runtimeEnv,
      stdio: ["ignore", "pipe", "pipe"],
    });
    const killTimer = setTimeout(() => {
      child.kill("SIGKILL");
      reject(new Error(`qa cli timed out: openclaw ${args.join(" ")}`));
    }, opts?.timeoutMs ?? 60_000);
    child.stdout.on("data", (chunk) => outChunks.push(Buffer.from(chunk)));
    child.stderr.on("data", (chunk) => errChunks.push(Buffer.from(chunk)));
    child.once("error", (error) => {
      clearTimeout(killTimer);
      reject(error);
    });
    child.once("exit", (code) => {
      clearTimeout(killTimer);
      if (code !== 0) {
        const detail = Buffer.concat(errChunks).toString("utf8").trim();
        reject(new Error(`qa cli failed (${code ?? "unknown"}): ${detail}`));
        return;
      }
      resolve();
    });
  });
  const text = Buffer.concat(outChunks).toString("utf8").trim();
  if (opts?.json) {
    return text ? (JSON.parse(text) as unknown) : {};
  }
  return text;
}
/**
 * Force a rebuild of the QA agent's memory index via the CLI, then run a
 * probe search and verify `expectedNeedle` appears somewhere in the JSON of
 * its results; throws when it does not.
 */
async function forceMemoryIndex(params: {
  env: QaSuiteEnvironment;
  query: string;
  expectedNeedle: string;
}) {
  const { env, query, expectedNeedle } = params;
  await runQaCli(env, ["memory", "index", "--agent", "qa", "--force"], {
    timeoutMs: liveTurnTimeoutMs(env, 60_000),
  });
  const searched = (await runQaCli(
    env,
    ["memory", "search", "--agent", "qa", "--json", "--query", query],
    {
      timeoutMs: liveTurnTimeoutMs(env, 20_000),
      json: true,
    },
  )) as { results?: Array<{ snippet?: string; text?: string; path?: string }> };
  const haystack = JSON.stringify(searched.results ?? []);
  if (!haystack.includes(expectedNeedle)) {
    throw new Error(`memory index missing expected fact after reindex: ${haystack}`);
  }
}
/** Look up a skill status entry by exact name; undefined when absent. */
function findSkill(skills: QaSkillStatusEntry[], name: string) {
  for (const skill of skills) {
    if (skill.name === name) {
      return skill;
    }
  }
  return undefined;
}
/**
 * Create (or overwrite) a workspace skill file at
 * `<workspaceDir>/skills/<name>/SKILL.md`, writing the trimmed body with a
 * trailing newline, and return the file's path.
 */
async function writeWorkspaceSkill(params: {
  env: QaSuiteEnvironment;
  name: string;
  body: string;
}) {
  const skillDir = path.join(params.env.gateway.workspaceDir, "skills", params.name);
  const skillPath = path.join(skillDir, "SKILL.md");
  await fs.mkdir(skillDir, { recursive: true });
  await fs.writeFile(skillPath, `${params.body.trim()}\n`, "utf8");
  return skillPath;
}
/**
 * Spawn the plugin-tools MCP server over stdio (node --import tsx running
 * src/mcp/plugin-tools-serve.ts) and invoke a single tool on it, returning
 * the raw MCP call result.
 *
 * Throws when the server does not list `params.toolName`. The client is
 * always closed in `finally` (close errors swallowed) so the child process
 * does not outlive the scenario.
 */
async function callPluginToolsMcp(params: {
  env: QaSuiteEnvironment;
  toolName: string;
  args: Record<string, unknown>;
}) {
  // Keep only string-valued entries of runtimeEnv for the child process env.
  const transportEnv = Object.fromEntries(
    Object.entries(params.env.gateway.runtimeEnv).filter(
      (entry): entry is [string, string] => typeof entry[1] === "string",
    ),
  );
  const transport = new StdioClientTransport({
    command: process.execPath,
    args: ["--import", "tsx", "src/mcp/plugin-tools-serve.ts"],
    stderr: "pipe",
    env: transportEnv,
  });
  const client = new Client({ name: "openclaw-qa-suite", version: "0.0.0" }, {});
  try {
    await client.connect(transport);
    // Verify the tool is advertised before calling it, for a clearer failure.
    const listed = await client.listTools();
    const tool = listed.tools.find((entry) => entry.name === params.toolName);
    if (!tool) {
      throw new Error(`MCP tool missing: ${params.toolName}`);
    }
    return await client.callTool({
      name: params.toolName,
      arguments: params.args,
    });
  } finally {
    // Best-effort shutdown; a close failure must not mask the result/error.
    await client.close().catch(() => {});
  }
}
async function runAgentPrompt(
env: QaSuiteEnvironment,
params: {
@ -285,7 +567,7 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
const message = await waitForOutboundMessage(
state,
(candidate) => candidate.conversation.id === "qa-room" && !candidate.threadId,
env.providerMode === "live-openai" ? 45_000 : 15_000,
env.providerMode === "live-openai" ? 45_000 : 45_000,
);
return message.text;
},
@ -706,6 +988,556 @@ function buildScenarioMap(env: QaSuiteEnvironment) {
},
]),
],
[
"memory-tools-channel-context",
async () =>
await runScenario("Memory tools in channel context", [
{
name: "uses memory_search plus memory_get before answering in-channel",
run: async () => {
await reset();
await fs.writeFile(
path.join(env.gateway.workspaceDir, "MEMORY.md"),
"Hidden QA fact: the project codename is ORBIT-9.\n",
"utf8",
);
await forceMemoryIndex({
env,
query: "project codename ORBIT-9",
expectedNeedle: "ORBIT-9",
});
const prompt =
"@openclaw Memory tools check: what is the hidden project codename stored only in memory? Use memory tools first.";
state.addInboundMessage({
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
senderId: "alice",
senderName: "Alice",
text: prompt,
});
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-room" && candidate.text.includes("ORBIT-9"),
liveTurnTimeoutMs(env, 30_000),
);
if (env.mock) {
const requests = await fetchJson<
Array<{ allInputText?: string; plannedToolName?: string; toolOutput?: string }>
>(`${env.mock.baseUrl}/debug/requests`);
const relevant = requests.filter((request) =>
String(request.allInputText ?? "").includes("Memory tools check"),
);
if (!relevant.some((request) => request.plannedToolName === "memory_search")) {
throw new Error("expected memory_search in mock request plan");
}
if (!requests.some((request) => request.plannedToolName === "memory_get")) {
throw new Error("expected memory_get in mock request plan");
}
}
return outbound.text;
},
},
]),
],
[
"memory-failure-fallback",
async () =>
await runScenario("Memory failure fallback", [
{
name: "falls back cleanly when group:memory tools are denied",
run: async () => {
const original = await readConfigSnapshot(env);
await fs.writeFile(
path.join(env.gateway.workspaceDir, "MEMORY.md"),
"Do not reveal directly: fallback fact is ORBIT-9.\n",
"utf8",
);
await patchConfig({
env,
patch: { tools: { deny: ["group:memory"] } },
});
await waitForGatewayHealthy(env);
try {
const sessionKey = await createSession(env, "Memory fallback");
const tools = await readEffectiveTools(env, sessionKey);
if (tools.has("memory_search") || tools.has("memory_get")) {
throw new Error("memory tools still present after deny patch");
}
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:memory-failure",
message:
"Memory unavailable check: a hidden fact exists only in memory files. If you cannot confirm it, say so clearly and do not guess.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForOutboundMessage(
state,
(candidate) => candidate.conversation.id === "qa-operator",
liveTurnTimeoutMs(env, 30_000),
);
const lower = outbound.text.toLowerCase();
if (outbound.text.includes("ORBIT-9")) {
throw new Error(`hallucinated hidden fact: ${outbound.text}`);
}
if (!lower.includes("could not confirm") && !lower.includes("will not guess")) {
throw new Error(`missing graceful fallback language: ${outbound.text}`);
}
return outbound.text;
} finally {
await applyConfig({
env,
nextConfig: original.config,
});
await waitForGatewayHealthy(env);
}
},
},
]),
],
[
"model-switch-tool-continuity",
async () =>
await runScenario("Model switch with tool continuity", [
{
name: "keeps using tools after switching models",
run: async () => {
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:model-switch-tools",
message:
"Read QA_KICKOFF_TASK.md and summarize the QA mission in one clause before any model switch.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const alternate = splitModelRef(env.alternateModel);
const beforeSwitchCursor = state.getSnapshot().messages.length;
await runAgentPrompt(env, {
sessionKey: "agent:qa:model-switch-tools",
message:
"Switch models now. Tool continuity check: reread QA_KICKOFF_TASK.md and mention the handoff in one short sentence.",
provider: alternate?.provider,
model: alternate?.model,
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForCondition(
() => {
const snapshot = state.getSnapshot();
return snapshot.messages
.slice(beforeSwitchCursor)
.filter(
(candidate) =>
candidate.direction === "outbound" &&
candidate.conversation.id === "qa-operator" &&
(candidate.text.toLowerCase().includes("model switch") ||
candidate.text.toLowerCase().includes("handoff")),
)
.at(-1);
},
liveTurnTimeoutMs(env, 30_000),
);
if (env.mock) {
const requests = await fetchJson<
Array<{ allInputText?: string; plannedToolName?: string; model?: string }>
>(`${env.mock.baseUrl}/debug/requests`);
const switched = requests.find((request) =>
String(request.allInputText ?? "").includes("Tool continuity check"),
);
if (switched?.plannedToolName !== "read") {
throw new Error(
`expected read after switch, got ${String(switched?.plannedToolName ?? "")}`,
);
}
if (switched?.model !== "gpt-5.4-alt") {
throw new Error(`expected alternate model, got ${String(switched?.model ?? "")}`);
}
}
return outbound.text;
},
},
]),
],
[
"mcp-plugin-tools-call",
async () =>
await runScenario("MCP plugin-tools call", [
{
name: "serves and calls memory_search over MCP",
run: async () => {
await fs.writeFile(
path.join(env.gateway.workspaceDir, "MEMORY.md"),
"MCP fact: the codename is ORBIT-9.\n",
"utf8",
);
await forceMemoryIndex({
env,
query: "ORBIT-9 codename",
expectedNeedle: "ORBIT-9",
});
const result = await callPluginToolsMcp({
env,
toolName: "memory_search",
args: {
query: "ORBIT-9 codename",
maxResults: 3,
},
});
const text = JSON.stringify(result.content ?? []);
if (!text.includes("ORBIT-9")) {
throw new Error(`MCP memory_search missed expected fact: ${text}`);
}
return text;
},
},
]),
],
// Scenario: a seeded workspace skill must show up as visible/eligible in
// skills.status and its instruction marker must surface on the next turn.
[
"skill-visibility-invocation",
async () =>
await runScenario("Skill visibility and invocation", [
{
name: "reports visible skill and applies its marker on the next turn",
run: async () => {
// Write a minimal skill whose only behavior is an exact reply marker.
// The template body (front matter + instruction) is runtime data.
await writeWorkspaceSkill({
env,
name: "qa-visible-skill",
body: `---
name: qa-visible-skill
description: Visible QA skill marker
---
When the user asks for the visible skill marker exactly, reply with exactly: VISIBLE-SKILL-OK`,
});
// Inventory check: the skill must be eligible, not disabled, and not
// blocked by a per-agent allowlist.
const skills = await readSkillStatus(env);
const visible = findSkill(skills, "qa-visible-skill");
if (!visible?.eligible || visible.disabled || visible.blockedByAllowlist) {
throw new Error(`skill not visible/eligible: ${JSON.stringify(visible)}`);
}
// Clear prior traffic so the outbound match below can't hit stale messages.
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:visible-skill",
message: "Visible skill marker: give me the visible skill marker exactly.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
// Pass condition: the agent's reply to qa-operator carries the skill marker.
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-operator" &&
candidate.text.includes("VISIBLE-SKILL-OK"),
liveTurnTimeoutMs(env, 20_000),
);
return outbound.text;
},
},
]),
],
// Scenario: a skill added mid-lifetime becomes usable without a gateway
// restart — absent before install, eligible after, effective on the next turn.
[
"skill-install-hot-availability",
async () =>
await runScenario("Skill install hot availability", [
{
name: "picks up a newly added workspace skill without restart",
run: async () => {
// Precondition: the skill must not already exist (a leftover from a
// previous run would make the hot-install result meaningless).
const before = await readSkillStatus(env);
if (findSkill(before, "qa-hot-install-skill")) {
throw new Error("qa-hot-install-skill unexpectedly already present");
}
await writeWorkspaceSkill({
env,
name: "qa-hot-install-skill",
body: `---
name: qa-hot-install-skill
description: Hot install QA marker
---
When the user asks for the hot install marker exactly, reply with exactly: HOT-INSTALL-OK`,
});
// Poll inventory until the new skill is eligible; returning undefined
// keeps waitForCondition polling (15 s budget, 200 ms interval).
await waitForCondition(
async () => {
const skills = await readSkillStatus(env);
return findSkill(skills, "qa-hot-install-skill")?.eligible ? true : undefined;
},
15_000,
200,
);
// Clear traffic, then confirm the skill's marker shows up on the next turn.
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:hot-skill",
message: "Hot install marker: give me the hot install marker exactly.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-operator" &&
candidate.text.includes("HOT-INSTALL-OK"),
liveTurnTimeoutMs(env, 20_000),
);
return outbound.text;
},
},
]),
],
// Scenario: configuring agents.defaults.imageGenerationModel must surface the
// image_generate tool, and a generation turn must produce a real media file.
[
"native-image-generation",
async () =>
await runScenario("Native image generation", [
{
name: "enables image_generate and saves a real media artifact",
run: async () => {
// Pick the provider-qualified model ref matching the suite's mode.
const imageModelRef =
env.providerMode === "live-openai"
? "openai/gpt-image-1"
: "mock-openai/gpt-image-1";
// Hot-patch the config to enable native image generation.
await patchConfig({
env,
patch: {
agents: {
defaults: {
imageGenerationModel: {
primary: imageModelRef,
},
},
},
},
});
await waitForGatewayHealthy(env);
// The tool must now appear in the session's effective tool set.
const sessionKey = await createSession(env, "Image generation");
const tools = await readEffectiveTools(env, sessionKey);
if (!tools.has("image_generate")) {
throw new Error("image_generate not present after imageGenerationModel patch");
}
await reset();
await runAgentPrompt(env, {
sessionKey: "agent:qa:image-generate",
message:
"Image generation check: generate a QA lighthouse image and summarize it in one short sentence.",
timeoutMs: liveTurnTimeoutMs(env, 45_000),
});
const outbound = await waitForOutboundMessage(
state,
(candidate) => candidate.conversation.id === "qa-operator",
liveTurnTimeoutMs(env, 45_000),
);
// Mock-mode deep checks: inspect the mock server's request log to prove
// the planned tool and the generated artifact, not just the reply text.
if (env.mock) {
const requests = await fetchJson<
Array<{ allInputText?: string; plannedToolName?: string; toolOutput?: string }>
>(`${env.mock.baseUrl}/debug/requests`);
// The turn that received our prompt must have planned image_generate.
const imageRequest = requests.find((request) =>
String(request.allInputText ?? "").includes("Image generation check"),
);
if (imageRequest?.plannedToolName !== "image_generate") {
throw new Error(
`expected image_generate, got ${String(imageRequest?.plannedToolName ?? "")}`,
);
}
// Some later request must have carried the tool's success output,
// pinned to the exact configured model ref.
const toolOutputRequest = requests.find((request) =>
String(request.toolOutput ?? "").includes(
`Generated 1 image with ${imageModelRef}.`,
),
);
if (!toolOutputRequest) {
throw new Error("missing mock image generation tool output");
}
// Extract the saved media path from the outbound reply ("MEDIA:<path>")
// and verify the file actually exists on disk.
const mediaPath = /MEDIA:([^\n]+)/.exec(outbound.text)?.[1]?.trim();
if (!mediaPath) {
throw new Error("missing MEDIA path in image generation tool output");
}
await fs.access(mediaPath);
}
return outbound.text;
},
},
]),
],
// Scenario: a hot-reloadable config.patch (mention routing) must take effect
// in the same gateway lifetime — old mention ignored, new mention answered.
// The original config is restored in a finally block regardless of outcome.
[
"config-patch-hot-apply",
async () =>
await runScenario("Config patch hot apply", [
{
name: "updates mention routing without restart",
run: async () => {
// Snapshot the pre-patch config so we can restore it afterwards.
const original = await readConfigSnapshot(env);
// Replace mention routing: only "goldenbot" triggers the agent now.
await patchConfig({
env,
patch: {
messages: {
groupChat: {
mentionPatterns: ["\\bgoldenbot\\b"],
},
},
},
});
await waitForGatewayHealthy(env);
try {
await reset();
// Baseline request count: used below to prove the ignored mention
// produced no new agent traffic in mock mode.
const requestsBeforeIgnored = env.mock
? await fetchJson<Array<{ allInputText?: string }>>(
`${env.mock.baseUrl}/debug/requests`,
)
: null;
// Negative case: the old "@openclaw" mention must now be ignored.
state.addInboundMessage({
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
senderId: "alice",
senderName: "Alice",
text: "@openclaw you should now be ignored",
});
// Poll briefly (3 s): fail fast if the ignored prompt reaches the
// agent; succeed once the request count is provably unchanged.
await waitForCondition(
async () => {
if (!env.mock) {
// Live mode: no mock log to inspect, so assert silence on
// the outbound side instead (comma expr returns true).
return (await waitForNoOutbound(state), true);
}
const requests = await fetchJson<Array<{ allInputText?: string }>>(
`${env.mock.baseUrl}/debug/requests`,
);
const ignoredPromptReachedAgent = requests.some((request) =>
String(request.allInputText ?? "").includes(
"@openclaw you should now be ignored",
),
);
if (ignoredPromptReachedAgent) {
throw new Error("ignored channel mention still reached the agent");
}
return requests.length === requestsBeforeIgnored?.length ? true : undefined;
},
3_000,
100,
);
// Positive case: the newly-configured "goldenbot" mention must route.
state.addInboundMessage({
conversation: { id: "qa-room", kind: "channel", title: "QA Room" },
senderId: "alice",
senderName: "Alice",
text: "goldenbot explain hot config apply",
});
const outbound = await waitForOutboundMessage(
state,
(candidate) => candidate.conversation.id === "qa-room",
liveTurnTimeoutMs(env, 30_000),
);
// Mock mode: also confirm the follow-up actually reached the agent.
if (env.mock) {
const requests = await fetchJson<Array<{ allInputText?: string }>>(
`${env.mock.baseUrl}/debug/requests`,
);
if (
!requests.some((request) =>
String(request.allInputText ?? "").includes(
"goldenbot explain hot config apply",
),
)
) {
throw new Error(
"goldenbot follow-up did not reach the agent after config patch",
);
}
}
return outbound.text;
} finally {
// Always restore the original config so later scenarios see the
// default mention routing, then wait for the gateway to settle.
await applyConfig({
env,
nextConfig: original.config,
});
await waitForGatewayHealthy(env);
}
},
},
]),
],
// Scenario: a restart-required config.apply must restart the gateway once and
// deliver a post-restart wake-up message (tagged with a unique marker) back
// into the qa-room channel tied to the provided sessionKey.
[
"config-apply-restart-wakeup",
async () =>
await runScenario("Config apply restart wake-up", [
{
name: "restarts cleanly and posts the restart sentinel back into qa-channel",
run: async () => {
await reset();
const sessionKey = "agent:qa:restart-wakeup";
// Establish an active session bound to qa-room before the restart so
// the wake-up ping has a conversation to return to.
await runAgentPrompt(env, {
sessionKey,
to: "channel:qa-room",
message: "Acknowledge restart wake-up setup in qa-room.",
timeoutMs: liveTurnTimeoutMs(env, 30_000),
});
// Build the next config from a deep copy; `??=` materializes the
// gateway.controlUi path if absent.
const current = await readConfigSnapshot(env);
const nextConfig = structuredClone(current.config);
const gatewayConfig = (nextConfig.gateway ??= {}) as Record<string, unknown>;
const controlUi = (gatewayConfig.controlUi ??= {}) as Record<string, unknown>;
const allowedOrigins = Array.isArray(controlUi.allowedOrigins)
? [...(controlUi.allowedOrigins as string[])]
: [];
// Unique marker; the wake-up message is matched on it below.
const wakeMarker = `QA-RESTART-${randomUUID().slice(0, 8)}`;
// NOTE(review): if this origin is already present (e.g. a re-run against
// the same state), nextConfig is identical to the current config —
// confirm applyConfig still schedules a restart on a no-op apply.
if (!allowedOrigins.includes("http://127.0.0.1:65535")) {
allowedOrigins.push("http://127.0.0.1:65535");
}
controlUi.allowedOrigins = allowedOrigins;
// Apply with sessionKey + note so the restart sentinel can echo the
// marker back into the session's channel after the restart.
await applyConfig({
env,
nextConfig,
sessionKey,
note: wakeMarker,
});
// Generous 60 s budgets: a full gateway restart is in the loop here.
await waitForGatewayHealthy(env, 60_000);
const outbound = await waitForOutboundMessage(
state,
(candidate) =>
candidate.conversation.id === "qa-room" && candidate.text.includes(wakeMarker),
60_000,
);
return outbound.text;
},
},
]),
],
// Scenario: tools.effective and skills.status must track config changes —
// a denied tool disappears from the effective set and a disabled skill is
// reported as disabled, in the same gateway lifetime.
[
"runtime-inventory-drift-check",
async () =>
await runScenario("Runtime inventory drift check", [
{
name: "keeps tools.effective and skills.status aligned after config changes",
run: async () => {
// Seed a skill we can later disable via config.
await writeWorkspaceSkill({
env,
name: "qa-drift-skill",
body: `---
name: qa-drift-skill
description: Drift skill marker
---
When the user asks for the drift skill marker exactly, reply with exactly: DRIFT-SKILL-OK`,
});
// Baseline: image_generate must be present before the deny patch.
// NOTE(review): this presumes an earlier scenario (or default config)
// already enabled imageGenerationModel — confirm ordering if scenarios
// are run in isolation via scenarioIds.
const sessionKey = await createSession(env, "Inventory drift");
const beforeTools = await readEffectiveTools(env, sessionKey);
if (!beforeTools.has("image_generate")) {
throw new Error("expected image_generate before drift patch");
}
const beforeSkills = await readSkillStatus(env);
if (!findSkill(beforeSkills, "qa-drift-skill")?.eligible) {
throw new Error("expected qa-drift-skill to be eligible before patch");
}
// Flip both inventory surfaces in one patch: deny the tool and
// disable the skill.
await patchConfig({
env,
patch: {
tools: {
deny: ["image_generate"],
},
skills: {
entries: {
"qa-drift-skill": {
enabled: false,
},
},
},
},
});
await waitForGatewayHealthy(env);
// Post-patch: tool gone from the effective set for the same session...
const afterTools = await readEffectiveTools(env, sessionKey);
if (afterTools.has("image_generate")) {
throw new Error("image_generate still present after deny patch");
}
// ...and the skill reported with disabled state (not merely absent).
const afterSkills = await readSkillStatus(env);
const driftSkill = findSkill(afterSkills, "qa-drift-skill");
if (!driftSkill?.disabled) {
throw new Error(`expected disabled drift skill, got ${JSON.stringify(driftSkill)}`);
}
return `image_generate removed, qa-drift-skill disabled=${String(driftSkill.disabled)}`;
},
},
]),
],
]);
}
@ -715,6 +1547,7 @@ export async function runQaSuite(params?: {
primaryModel?: string;
alternateModel?: string;
fastMode?: boolean;
scenarioIds?: string[];
}) {
const startedAt = new Date();
const providerMode = params?.providerMode ?? "mock-openai";
@ -768,13 +1601,28 @@ export async function runQaSuite(params?: {
try {
const catalog = readQaBootstrapScenarioCatalog();
const requestedScenarioIds = params?.scenarioIds ? new Set(params.scenarioIds) : null;
const selectedCatalogScenarios = requestedScenarioIds
? catalog.scenarios.filter((scenario) => requestedScenarioIds.has(scenario.id))
: catalog.scenarios;
if (requestedScenarioIds) {
const foundScenarioIds = new Set(selectedCatalogScenarios.map((scenario) => scenario.id));
const missingScenarioIds = [...requestedScenarioIds].filter(
(scenarioId) => !foundScenarioIds.has(scenarioId),
);
if (missingScenarioIds.length > 0) {
throw new Error(`unknown QA scenario id(s): ${missingScenarioIds.join(", ")}`);
}
}
const scenarioMap = buildScenarioMap(env);
const scenarios: QaSuiteScenarioResult[] = [];
const liveScenarioOutcomes: QaLabScenarioOutcome[] = catalog.scenarios.map((scenario) => ({
id: scenario.id,
name: scenario.title,
status: "pending",
}));
const liveScenarioOutcomes: QaLabScenarioOutcome[] = selectedCatalogScenarios.map(
(scenario) => ({
id: scenario.id,
name: scenario.title,
status: "pending",
}),
);
lab.setScenarioRun({
kind: "suite",
@ -783,7 +1631,7 @@ export async function runQaSuite(params?: {
scenarios: liveScenarioOutcomes,
});
for (const [index, scenario] of catalog.scenarios.entries()) {
for (const [index, scenario] of selectedCatalogScenarios.entries()) {
const run = scenarioMap.get(scenario.id);
if (!run) {
const missingResult = {

150
qa/new-scenarios-2026-04.md Normal file
View File

@ -0,0 +1,150 @@
# QA Scenario Expansion - Round 2
Ten repo-grounded candidate scenarios to add after the current seed suite.
## 1. On-demand memory tools in channel context
- Goal: verify the agent uses `memory_search` plus `memory_get` instead of bluffing when a channel message asks about prior notes.
- Flow:
- Seed `MEMORY.md` or `memory/*.md` with a fact not present in the current transcript.
- Ask in a channel thread for that fact.
- Verify tool usage and final answer accuracy.
- Pass:
- `memory_search` runs first.
- `memory_get` narrows to the right lines.
- Final answer cites the remembered fact correctly without cross-session leakage.
- Docs: `docs/concepts/memory.md`, `docs/concepts/memory-search.md`
- Code: `extensions/memory-core/src/tools.ts`, `extensions/memory-core/src/prompt-section.ts`
## 2. Memory failure fallback
- Goal: verify memory failure is graceful when embeddings/search are unavailable.
- Flow:
- Disable or break the embedding-backed memory path.
- Ask for prior-note recall.
- Verify the agent surfaces uncertainty and next action instead of hallucinating.
- Pass:
- Tool failure does not crash the run.
- Agent says it checked and could not confirm.
- Report includes the remediation hint.
- Docs: `docs/concepts/memory.md`, `docs/help/faq.md`
- Code: `extensions/memory-core/src/tools.shared.ts`, `extensions/memory-core/src/tools.citations.test.ts`
## 3. Model switch with tool continuity
- Goal: verify model switching preserves session context and tool availability, not just plain text continuity.
- Flow:
- Start on one model.
- Switch to another configured model.
- Ask for a tool-using follow-up such as file read or memory lookup.
- Pass:
- Switch is reflected in runtime state.
- Tool call still succeeds after the switch.
- Final answer keeps prior context.
- Docs: `docs/help/testing.md`, `docs/concepts/model-failover.md`
- Code: `extensions/qa-lab/src/suite.ts`, `docs/web/webchat.md`
## 4. MCP-backed recall via QMD/mcporter
- Goal: verify an MCP-backed tool path works end to end, not just core tools.
- Flow:
- Enable `memory.qmd.mcporter`.
- Ask for recall that should route through the QMD MCP bridge.
- Verify response and captured MCP execution path.
- Pass:
- MCP-backed search path is used.
- Returned snippet matches the right note.
- Failure mode is explicit if the daemon/tool is missing.
- Docs: `docs/gateway/secrets.md`, `docs/concepts/memory-qmd.md`
- Code: `extensions/memory-core/src/memory/qmd-manager.ts`, `extensions/memory-core/src/memory/qmd-manager.test.ts`
## 5. Skill visibility and invocation
- Goal: verify the agent sees a workspace/project skill and actually uses it.
- Flow:
- Add a simple workspace or `.agents` skill.
- Confirm skill visibility through runtime inventory.
- Ask for a task that should trigger the skill.
- Pass:
- Skill appears in `skills.status`.
- Agent invocation reflects the installed skill instructions.
- Per-agent allowlist behavior is respected.
- Docs: `docs/tools/skills.md`, `docs/gateway/protocol.md`, `docs/gateway/configuration.md`
- Code: `.agents/skills/openclaw-qa-testing/SKILL.md`, `docs/gateway/protocol.md`
## 6. Skill install and hot availability
- Goal: verify a newly installed skill becomes usable without a broken intermediate state.
- Flow:
- Install a ClawHub or gateway-managed skill.
- Re-check skill inventory.
- Ask the agent to perform the skill-backed task.
- Pass:
- Install succeeds.
- `skills.status` or `skills.bins` reflects the new skill.
- Agent can use the skill immediately or after the expected reload path.
- Docs: `docs/tools/skills.md`, `docs/cli/skills.md`, `docs/gateway/protocol.md`
- Code: `docs/gateway/protocol.md`, `docs/tools/skills.md`
## 7. Native image generation
- Goal: verify `image_generate` appears only when configured and returns a real attachment/artifact.
- Flow:
- Configure `agents.defaults.imageGenerationModel.primary`.
- Ask for a simple generated image.
- Verify generated media is returned in the reply path.
- Pass:
- `image_generate` is in the effective tool set.
- Generation succeeds with the configured provider/model.
- Output is attached and the agent summarizes what it created.
- Docs: `docs/tools/image-generation.md`, `docs/providers/openai.md`
- Code: `src/agents/openclaw-tools.image-generation.test.ts`, `src/image-generation/runtime.ts`
## 8. Hot config patch without restart
- Goal: verify a safe config edit hot-applies and changes behavior immediately.
- Flow:
- Use `config.patch` to change a hot-reloadable field such as agent skill visibility or message behavior.
- Retry the task in the same gateway lifetime.
- Pass:
- Patch succeeds.
- No disruptive restart loop.
- New behavior is live immediately.
- Docs: `docs/gateway/configuration.md`, `docs/gateway/protocol.md`
- Code: `docs/gateway/configuration.md`, `docs/web/control-ui.md`
## 9. Restart-required config apply with wake-up
- Goal: verify a restart-required config change restarts cleanly and wakes the session back up.
- Flow:
- Use `config.apply` or `update.run` on a restart-required surface.
- Provide `sessionKey` so the operator gets the post-restart ping.
- Resume the task after restart.
- Pass:
- Restart happens once.
- Session wake-up ping arrives.
- Agent continues in the same logical workflow after restart.
- Docs: `docs/gateway/configuration.md`, `docs/web/control-ui.md`
- Code: `docs/gateway/configuration.md`, `docs/gateway/protocol.md`
## 10. Runtime inventory drift check
- Goal: verify the reported tool and skill inventory matches what the agent can really use after config/plugin changes.
- Flow:
- Read `tools.effective` and `skills.status`.
- Ask the agent to use one enabled thing and one disabled thing.
- Compare actual behavior vs reported inventory.
- Pass:
- Enabled item is callable.
- Disabled item is absent or blocked for the right reason.
- Inventory and runtime behavior stay in sync.
- Docs: `docs/gateway/protocol.md`, `docs/web/webchat.md`
- Code: `docs/gateway/protocol.md`, `docs/web/control-ui.md`
## Best next additions to the executable suite
If we only promote three right away:
1. On-demand memory tools in channel context
2. Native image generation
3. Hot config patch without restart

View File

@ -135,5 +135,141 @@
],
"docsRefs": ["docs/channels/qa-channel.md", "docs/channels/group-messages.md"],
"codeRefs": ["extensions/qa-channel/src/protocol.ts", "extensions/qa-lab/src/bus-state.ts"]
},
{
"id": "memory-tools-channel-context",
"title": "Memory tools in channel context",
"surface": "memory",
"objective": "Verify the agent uses memory_search and memory_get in a shared channel when the answer lives only in memory files, not the live transcript.",
"successCriteria": [
"Agent uses memory_search before answering.",
"Agent narrows with memory_get before answering.",
"Final reply returns the memory-only fact correctly in-channel."
],
"docsRefs": ["docs/concepts/memory.md", "docs/concepts/memory-search.md"],
"codeRefs": ["extensions/memory-core/src/tools.ts", "extensions/qa-lab/src/suite.ts"]
},
{
"id": "memory-failure-fallback",
"title": "Memory failure fallback",
"surface": "memory",
"objective": "Verify the agent degrades gracefully when memory tools are unavailable and the answer exists only in memory-backed notes.",
"successCriteria": [
"Memory tools are absent from the effective tool inventory.",
"Agent does not hallucinate the hidden fact.",
"Agent says it could not confirm and surfaces the limitation."
],
"docsRefs": ["docs/concepts/memory.md", "docs/tools/index.md"],
"codeRefs": ["extensions/memory-core/src/tools.ts", "extensions/qa-lab/src/suite.ts"]
},
{
"id": "model-switch-tool-continuity",
"title": "Model switch with tool continuity",
"surface": "models",
"objective": "Verify switching models preserves session context and tool use instead of dropping into plain-text only behavior.",
"successCriteria": [
"Alternate model is actually requested.",
"A tool call still happens after the model switch.",
"Final answer acknowledges the handoff and uses the tool-derived evidence."
],
"docsRefs": ["docs/help/testing.md", "docs/concepts/model-failover.md"],
"codeRefs": ["extensions/qa-lab/src/suite.ts", "extensions/qa-lab/src/mock-openai-server.ts"]
},
{
"id": "mcp-plugin-tools-call",
"title": "MCP plugin-tools call",
"surface": "mcp",
"objective": "Verify OpenClaw can expose plugin tools over MCP and a real MCP client can call one successfully.",
"successCriteria": [
"Plugin tools MCP server lists memory_search.",
"A real MCP client calls memory_search successfully.",
"The returned MCP payload includes the expected memory-only fact."
],
"docsRefs": ["docs/cli/mcp.md", "docs/gateway/protocol.md"],
"codeRefs": ["src/mcp/plugin-tools-serve.ts", "extensions/qa-lab/src/suite.ts"]
},
{
"id": "skill-visibility-invocation",
"title": "Skill visibility and invocation",
"surface": "skills",
"objective": "Verify a workspace skill becomes visible in skills.status and influences the next agent turn.",
"successCriteria": [
"skills.status reports the seeded skill as visible and eligible.",
"The next agent turn reflects the skill instruction marker.",
"The result stays scoped to the active QA workspace skill."
],
"docsRefs": ["docs/tools/skills.md", "docs/gateway/protocol.md"],
"codeRefs": ["src/agents/skills-status.ts", "extensions/qa-lab/src/suite.ts"]
},
{
"id": "skill-install-hot-availability",
"title": "Skill install hot availability",
"surface": "skills",
"objective": "Verify a newly added workspace skill shows up without a broken intermediate state and can influence the next turn immediately.",
"successCriteria": [
"Skill is absent before install.",
"skills.status reports it after install without a restart.",
"The next agent turn reflects the new skill marker."
],
"docsRefs": ["docs/tools/skills.md", "docs/gateway/configuration.md"],
"codeRefs": ["src/agents/skills-status.ts", "extensions/qa-lab/src/suite.ts"]
},
{
"id": "native-image-generation",
"title": "Native image generation",
"surface": "image-generation",
"objective": "Verify image_generate appears when configured and returns a real saved media artifact.",
"successCriteria": [
"image_generate appears in the effective tool inventory.",
"Agent triggers native image_generate.",
"Tool output returns a saved MEDIA path and the file exists."
],
"docsRefs": ["docs/tools/image-generation.md", "docs/providers/openai.md"],
"codeRefs": [
"src/agents/tools/image-generate-tool.ts",
"extensions/qa-lab/src/mock-openai-server.ts"
]
},
{
"id": "config-patch-hot-apply",
"title": "Config patch hot apply",
"surface": "config",
"objective": "Verify a hot-reloadable config.patch takes effect immediately without a disruptive restart.",
"successCriteria": [
"config.patch succeeds with no restart dependency.",
"Old mention routing behavior stops working immediately.",
"New mention routing behavior works in the same gateway lifetime."
],
"docsRefs": ["docs/gateway/configuration.md", "docs/gateway/protocol.md"],
"codeRefs": ["src/gateway/server-methods/config.ts", "extensions/qa-lab/src/suite.ts"]
},
{
"id": "config-apply-restart-wakeup",
"title": "Config apply restart wake-up",
"surface": "config",
"objective": "Verify a restart-required config.apply restarts cleanly and delivers the post-restart wake message back into the QA channel.",
"successCriteria": [
"config.apply schedules a restart-required change.",
"Gateway becomes healthy again after restart.",
"Restart sentinel wake-up message arrives in the QA channel."
],
"docsRefs": ["docs/gateway/configuration.md", "docs/gateway/protocol.md"],
"codeRefs": ["src/gateway/server-methods/config.ts", "src/gateway/server-restart-sentinel.ts"]
},
{
"id": "runtime-inventory-drift-check",
"title": "Runtime inventory drift check",
"surface": "inventory",
"objective": "Verify tools.effective and skills.status stay aligned with runtime behavior after config changes.",
"successCriteria": [
"Enabled tool appears before the config change.",
"After config change, disabled tool disappears from tools.effective.",
"Disabled skill appears in skills.status with disabled state."
],
"docsRefs": ["docs/gateway/protocol.md", "docs/tools/skills.md", "docs/tools/index.md"],
"codeRefs": [
"src/gateway/server-methods/tools-effective.ts",
"src/gateway/server-methods/skills.ts"
]
}
]