// Mirror of https://github.com/openclaw/openclaw.git
import fs from "node:fs/promises";
|
||
import path from "node:path";
|
||
import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
|
||
import type { MsgContext } from "../auto-reply/templating.js";
|
||
import type { OpenClawConfig } from "../config/config.js";
|
||
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
|
||
import { createSafeAudioFixtureBuffer } from "./runner.test-utils.js";
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Module mocks
|
||
// ---------------------------------------------------------------------------
|
||
|
||
vi.mock("../agents/model-auth.js", () => ({
|
||
resolveApiKeyForProvider: vi.fn(async () => ({
|
||
apiKey: "test-key", // pragma: allowlist secret
|
||
source: "test",
|
||
mode: "api-key",
|
||
})),
|
||
requireApiKey: (auth: { apiKey?: string; mode?: string }, provider: string) => {
|
||
if (auth?.apiKey) {
|
||
return auth.apiKey;
|
||
}
|
||
throw new Error(`No API key resolved for provider "${provider}" (auth mode: ${auth?.mode}).`);
|
||
},
|
||
resolveAwsSdkEnvVarName: vi.fn(() => undefined),
|
||
resolveEnvApiKey: vi.fn(() => null),
|
||
resolveModelAuthMode: vi.fn(() => "api-key"),
|
||
getApiKeyForModel: vi.fn(async () => ({ apiKey: "test-key", source: "test", mode: "api-key" })),
|
||
getCustomProviderApiKey: vi.fn(() => undefined),
|
||
ensureAuthProfileStore: vi.fn(async () => ({})),
|
||
resolveAuthProfileOrder: vi.fn(() => []),
|
||
}));
|
||
|
||
const { MediaFetchErrorMock } = vi.hoisted(() => {
|
||
class MediaFetchErrorMock extends Error {
|
||
code: string;
|
||
constructor(message: string, code: string) {
|
||
super(message);
|
||
this.name = "MediaFetchError";
|
||
this.code = code;
|
||
}
|
||
}
|
||
return { MediaFetchErrorMock };
|
||
});
|
||
|
||
vi.mock("../media/fetch.js", () => ({
|
||
fetchRemoteMedia: vi.fn(),
|
||
MediaFetchError: MediaFetchErrorMock,
|
||
}));
|
||
|
||
vi.mock("../process/exec.js", () => ({
|
||
runExec: vi.fn(),
|
||
runCommandWithTimeout: vi.fn(),
|
||
}));
|
||
|
||
const mockDeliverOutboundPayloads = vi.fn();
|
||
|
||
vi.mock("../infra/outbound/deliver.js", () => ({
|
||
deliverOutboundPayloads: (...args: unknown[]) => mockDeliverOutboundPayloads(...args),
|
||
}));
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Helpers
|
||
// ---------------------------------------------------------------------------
|
||
|
||
let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
|
||
let clearMediaUnderstandingBinaryCacheForTests: () => void;
|
||
|
||
const TEMP_MEDIA_PREFIX = "openclaw-echo-transcript-test-";
|
||
let suiteTempMediaRootDir = "";
|
||
|
||
async function createTempAudioFile(): Promise<string> {
|
||
const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "case-"));
|
||
const filePath = path.join(dir, "note.ogg");
|
||
await fs.writeFile(filePath, createSafeAudioFixtureBuffer(2048));
|
||
return filePath;
|
||
}
|
||
|
||
function createAudioCtxWithProvider(mediaPath: string, extra?: Partial<MsgContext>): MsgContext {
|
||
return {
|
||
Body: "<media:audio>",
|
||
MediaPath: mediaPath,
|
||
MediaType: "audio/ogg",
|
||
Provider: "whatsapp",
|
||
From: "+10000000001",
|
||
AccountId: "acc1",
|
||
...extra,
|
||
};
|
||
}
|
||
|
||
function createAudioConfigWithEcho(opts?: {
|
||
echoTranscript?: boolean;
|
||
echoFormat?: string;
|
||
transcribedText?: string;
|
||
}): {
|
||
cfg: OpenClawConfig;
|
||
providers: Record<string, { id: string; transcribeAudio: () => Promise<{ text: string }> }>;
|
||
} {
|
||
const cfg: OpenClawConfig = {
|
||
tools: {
|
||
media: {
|
||
audio: {
|
||
enabled: true,
|
||
maxBytes: 1024 * 1024,
|
||
models: [{ provider: "groq" }],
|
||
echoTranscript: opts?.echoTranscript ?? true,
|
||
...(opts?.echoFormat !== undefined ? { echoFormat: opts.echoFormat } : {}),
|
||
},
|
||
},
|
||
},
|
||
};
|
||
const providers = {
|
||
groq: {
|
||
id: "groq",
|
||
transcribeAudio: async () => ({ text: opts?.transcribedText ?? "hello world" }),
|
||
},
|
||
};
|
||
return { cfg, providers };
|
||
}
|
||
|
||
function expectSingleEchoDeliveryCall() {
|
||
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
|
||
const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
|
||
expect(callArgs).toBeDefined();
|
||
return callArgs as {
|
||
to?: string;
|
||
channel?: string;
|
||
accountId?: string;
|
||
payloads: Array<{ text?: string }>;
|
||
};
|
||
}
|
||
|
||
function createAudioConfigWithoutEchoFlag() {
|
||
const { cfg, providers } = createAudioConfigWithEcho();
|
||
const audio = cfg.tools?.media?.audio as { echoTranscript?: boolean } | undefined;
|
||
if (audio && "echoTranscript" in audio) {
|
||
delete audio.echoTranscript;
|
||
}
|
||
return { cfg, providers };
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Tests
|
||
// ---------------------------------------------------------------------------
|
||
|
||
describe("applyMediaUnderstanding – echo transcript", () => {
|
||
beforeAll(async () => {
|
||
const baseDir = resolvePreferredOpenClawTmpDir();
|
||
await fs.mkdir(baseDir, { recursive: true });
|
||
suiteTempMediaRootDir = await fs.mkdtemp(path.join(baseDir, TEMP_MEDIA_PREFIX));
|
||
const mod = await import("./apply.js");
|
||
applyMediaUnderstanding = mod.applyMediaUnderstanding;
|
||
const runner = await import("./runner.js");
|
||
clearMediaUnderstandingBinaryCacheForTests = runner.clearMediaUnderstandingBinaryCacheForTests;
|
||
});
|
||
|
||
beforeEach(() => {
|
||
mockDeliverOutboundPayloads.mockClear();
|
||
mockDeliverOutboundPayloads.mockResolvedValue([{ channel: "whatsapp", messageId: "echo-1" }]);
|
||
clearMediaUnderstandingBinaryCacheForTests?.();
|
||
});
|
||
|
||
afterAll(async () => {
|
||
if (!suiteTempMediaRootDir) {
|
||
return;
|
||
}
|
||
await fs.rm(suiteTempMediaRootDir, { recursive: true, force: true });
|
||
suiteTempMediaRootDir = "";
|
||
});
|
||
|
||
it("does NOT echo when echoTranscript is false (default)", async () => {
|
||
const mediaPath = await createTempAudioFile();
|
||
const ctx = createAudioCtxWithProvider(mediaPath);
|
||
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: false });
|
||
|
||
await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
||
});
|
||
|
||
it("does NOT echo when echoTranscript is absent (default)", async () => {
|
||
const mediaPath = await createTempAudioFile();
|
||
const ctx = createAudioCtxWithProvider(mediaPath);
|
||
const { cfg, providers } = createAudioConfigWithoutEchoFlag();
|
||
|
||
await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
||
});
|
||
|
||
it("echoes transcript with default format when echoTranscript is true", async () => {
|
||
const mediaPath = await createTempAudioFile();
|
||
const ctx = createAudioCtxWithProvider(mediaPath);
|
||
const { cfg, providers } = createAudioConfigWithEcho({
|
||
echoTranscript: true,
|
||
transcribedText: "hello world",
|
||
});
|
||
|
||
await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
const callArgs = expectSingleEchoDeliveryCall();
|
||
expect(callArgs.channel).toBe("whatsapp");
|
||
expect(callArgs.to).toBe("+10000000001");
|
||
expect(callArgs.accountId).toBe("acc1");
|
||
expect(callArgs.payloads).toHaveLength(1);
|
||
expect(callArgs.payloads[0].text).toBe('📝 "hello world"');
|
||
});
|
||
|
||
it("uses custom echoFormat when provided", async () => {
|
||
const mediaPath = await createTempAudioFile();
|
||
const ctx = createAudioCtxWithProvider(mediaPath);
|
||
const { cfg, providers } = createAudioConfigWithEcho({
|
||
echoTranscript: true,
|
||
echoFormat: "🎙️ Heard: {transcript}",
|
||
transcribedText: "custom message",
|
||
});
|
||
|
||
await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
const callArgs = expectSingleEchoDeliveryCall();
|
||
expect(callArgs.payloads[0].text).toBe("🎙️ Heard: custom message");
|
||
});
|
||
|
||
it("does NOT echo when there are no audio attachments", async () => {
|
||
// Image-only context — no audio attachment
|
||
const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "img-"));
|
||
const imgPath = path.join(dir, "photo.jpg");
|
||
await fs.writeFile(imgPath, Buffer.from([0xff, 0xd8, 0xff, 0xe0]));
|
||
|
||
const ctx: MsgContext = {
|
||
Body: "<media:image>",
|
||
MediaPath: imgPath,
|
||
MediaType: "image/jpeg",
|
||
Provider: "whatsapp",
|
||
From: "+10000000001",
|
||
};
|
||
|
||
const { cfg, providers } = createAudioConfigWithEcho({
|
||
echoTranscript: true,
|
||
transcribedText: "should not appear",
|
||
});
|
||
cfg.tools!.media!.image = { enabled: false };
|
||
|
||
await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
// No audio outputs → Transcript not set → no echo
|
||
expect(ctx.Transcript).toBeUndefined();
|
||
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
||
});
|
||
|
||
it("does NOT echo when transcription fails", async () => {
|
||
const mediaPath = await createTempAudioFile();
|
||
const ctx = createAudioCtxWithProvider(mediaPath);
|
||
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
||
providers.groq.transcribeAudio = async () => {
|
||
throw new Error("transcription provider failure");
|
||
};
|
||
|
||
// Should not throw; transcription failure is swallowed by runner
|
||
await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
expect(ctx.Transcript).toBeUndefined();
|
||
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
||
});
|
||
|
||
it("does NOT echo when channel is not deliverable", async () => {
|
||
const mediaPath = await createTempAudioFile();
|
||
// Use an internal/non-deliverable channel
|
||
const ctx = createAudioCtxWithProvider(mediaPath, {
|
||
Provider: "internal-system",
|
||
From: "some-source",
|
||
});
|
||
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
||
|
||
await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
// Transcript should be set (transcription succeeded)
|
||
expect(ctx.Transcript).toBe("hello world");
|
||
// But echo should be skipped
|
||
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
||
});
|
||
|
||
it("does NOT echo when ctx has no From or OriginatingTo", async () => {
|
||
const mediaPath = await createTempAudioFile();
|
||
const ctx: MsgContext = {
|
||
Body: "<media:audio>",
|
||
MediaPath: mediaPath,
|
||
MediaType: "audio/ogg",
|
||
Provider: "whatsapp",
|
||
// From and OriginatingTo intentionally absent
|
||
};
|
||
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
||
|
||
await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
expect(ctx.Transcript).toBe("hello world");
|
||
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
|
||
});
|
||
|
||
it("uses OriginatingTo when From is absent", async () => {
|
||
const mediaPath = await createTempAudioFile();
|
||
const ctx: MsgContext = {
|
||
Body: "<media:audio>",
|
||
MediaPath: mediaPath,
|
||
MediaType: "audio/ogg",
|
||
Provider: "whatsapp",
|
||
OriginatingTo: "+19999999999",
|
||
};
|
||
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
||
|
||
await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
const callArgs = expectSingleEchoDeliveryCall();
|
||
expect(callArgs.to).toBe("+19999999999");
|
||
});
|
||
|
||
it("echo delivery failure does not throw or break transcription", async () => {
|
||
const mediaPath = await createTempAudioFile();
|
||
const ctx = createAudioCtxWithProvider(mediaPath);
|
||
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
|
||
|
||
mockDeliverOutboundPayloads.mockRejectedValueOnce(new Error("delivery timeout"));
|
||
|
||
// Should not throw
|
||
const result = await applyMediaUnderstanding({ ctx, cfg, providers });
|
||
|
||
// Transcription itself succeeded
|
||
expect(result.appliedAudio).toBe(true);
|
||
expect(ctx.Transcript).toBe("hello world");
|
||
// Deliver was attempted
|
||
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
|
||
});
|
||
});
|