import fs from "node:fs/promises";
import path from "node:path";
import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/config.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
import { createSafeAudioFixtureBuffer } from "./runner.test-utils.js";

// ---------------------------------------------------------------------------
// Module mocks
// ---------------------------------------------------------------------------

// Auth is stubbed so every provider lookup resolves to a fixed test API key.
vi.mock("../agents/model-auth.js", () => ({
  resolveApiKeyForProvider: vi.fn(async () => ({
    apiKey: "test-key", // pragma: allowlist secret
    source: "test",
    mode: "api-key",
  })),
  requireApiKey: (auth: { apiKey?: string; mode?: string }, provider: string) => {
    if (auth?.apiKey) {
      return auth.apiKey;
    }
    throw new Error(`No API key resolved for provider "${provider}" (auth mode: ${auth?.mode}).`);
  },
  resolveAwsSdkEnvVarName: vi.fn(() => undefined),
  resolveEnvApiKey: vi.fn(() => null),
  resolveModelAuthMode: vi.fn(() => "api-key"),
  getApiKeyForModel: vi.fn(async () => ({ apiKey: "test-key", source: "test", mode: "api-key" })),
  getCustomProviderApiKey: vi.fn(() => undefined),
  ensureAuthProfileStore: vi.fn(async () => ({})),
  resolveAuthProfileOrder: vi.fn(() => []),
}));

// The error class is created via vi.hoisted so it already exists when the
// hoisted vi.mock factory below references it.
const { MediaFetchErrorMock } = vi.hoisted(() => {
  class MediaFetchErrorMock extends Error {
    code: string;
    constructor(message: string, code: string) {
      super(message);
      this.name = "MediaFetchError";
      this.code = code;
    }
  }
  return { MediaFetchErrorMock };
});

vi.mock("../media/fetch.js", () => ({
  fetchRemoteMedia: vi.fn(),
  MediaFetchError: MediaFetchErrorMock,
}));

vi.mock("../process/exec.js", () => ({
  runExec: vi.fn(),
  runCommandWithTimeout: vi.fn(),
}));

// Outbound delivery is routed through this spy so tests can assert on echo calls.
const mockDeliverOutboundPayloads = vi.fn();
vi.mock("../infra/outbound/deliver.js", () => ({
  deliverOutboundPayloads: (...args: unknown[]) => mockDeliverOutboundPayloads(...args),
}));

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
let clearMediaUnderstandingBinaryCacheForTests: () => void;

const TEMP_MEDIA_PREFIX = "openclaw-echo-transcript-test-";
let suiteTempMediaRootDir = "";

/**
 * Writes a 2048-byte audio fixture to `note.ogg` inside a fresh `case-*`
 * directory under the suite temp root.
 *
 * @returns Absolute path of the written file.
 */
async function createTempAudioFile(): Promise<string> {
  const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "case-"));
  const filePath = path.join(dir, "note.ogg");
  await fs.writeFile(filePath, createSafeAudioFixtureBuffer(2048));
  return filePath;
}

/**
 * Builds a message context describing an `audio/ogg` attachment received on
 * the "whatsapp" provider from a fixed sender and account.
 *
 * @param mediaPath - Path of the audio file to reference as `MediaPath`.
 * @param extra - Optional fields spread last, overriding the defaults.
 */
function createAudioCtxWithProvider(mediaPath: string, extra?: Partial<MsgContext>): MsgContext {
  return {
    Body: "",
    MediaPath: mediaPath,
    MediaType: "audio/ogg",
    Provider: "whatsapp",
    From: "+10000000001",
    AccountId: "acc1",
    ...extra,
  };
}

/**
 * Builds a config with audio understanding enabled (single "groq" model) and
 * a stub "groq" provider whose `transcribeAudio` resolves to a fixed text.
 *
 * @param opts.echoTranscript - Value for `echoTranscript`; defaults to `true`.
 * @param opts.echoFormat - When defined, set as `echoFormat`; otherwise the key is omitted.
 * @param opts.transcribedText - Text returned by the stub; defaults to "hello world".
 */
function createAudioConfigWithEcho(opts?: {
  echoTranscript?: boolean;
  echoFormat?: string;
  transcribedText?: string;
}): {
  cfg: OpenClawConfig;
  providers: Record<string, { id: string; transcribeAudio: () => Promise<{ text: string }> }>;
} {
  const cfg: OpenClawConfig = {
    tools: {
      media: {
        audio: {
          enabled: true,
          maxBytes: 1024 * 1024,
          models: [{ provider: "groq" }],
          echoTranscript: opts?.echoTranscript ?? true,
          ...(opts?.echoFormat !== undefined ? { echoFormat: opts.echoFormat } : {}),
        },
      },
    },
  };
  const providers = {
    groq: {
      id: "groq",
      transcribeAudio: async () => ({ text: opts?.transcribedText ?? "hello world" }),
    },
  };
  return { cfg, providers };
}

/**
 * Asserts the delivery spy was called exactly once and returns the first
 * argument of that call for further assertions.
 */
function expectSingleEchoDeliveryCall() {
  expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
  const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
  expect(callArgs).toBeDefined();
  return callArgs as {
    to?: string;
    channel?: string;
    accountId?: string;
    payloads: Array<{ text?: string }>;
  };
}

/**
 * Same as {@link createAudioConfigWithEcho} with defaults, but with the
 * `echoTranscript` key removed from the audio config so the "absent" case
 * can be exercised.
 */
function createAudioConfigWithoutEchoFlag() {
  const { cfg, providers } = createAudioConfigWithEcho();
  const audio = cfg.tools?.media?.audio as { echoTranscript?: boolean } | undefined;
  if (audio && "echoTranscript" in audio) {
    delete audio.echoTranscript;
  }
  return { cfg, providers };
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

describe("applyMediaUnderstanding – echo transcript", () => {
  beforeAll(async () => {
    const baseDir = resolvePreferredOpenClawTmpDir();
    await fs.mkdir(baseDir, { recursive: true });
    suiteTempMediaRootDir = await fs.mkdtemp(path.join(baseDir, TEMP_MEDIA_PREFIX));
    // Modules under test are imported dynamically and stored in the
    // module-level bindings declared above.
    const mod = await import("./apply.js");
    applyMediaUnderstanding = mod.applyMediaUnderstanding;
    const runner = await import("./runner.js");
    clearMediaUnderstandingBinaryCacheForTests = runner.clearMediaUnderstandingBinaryCacheForTests;
  });

  beforeEach(() => {
    mockDeliverOutboundPayloads.mockClear();
    mockDeliverOutboundPayloads.mockResolvedValue([{ channel: "whatsapp", messageId: "echo-1" }]);
    clearMediaUnderstandingBinaryCacheForTests?.();
  });

  afterAll(async () => {
    if (!suiteTempMediaRootDir) {
      return;
    }
    await fs.rm(suiteTempMediaRootDir, { recursive: true, force: true });
    suiteTempMediaRootDir = "";
  });

  it("does NOT echo when echoTranscript is false (default)", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: false });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when echoTranscript is absent (default)", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithoutEchoFlag();

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("echoes transcript with default format when echoTranscript is true", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({
      echoTranscript: true,
      transcribedText: "hello world",
    });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    const callArgs = expectSingleEchoDeliveryCall();
    expect(callArgs.channel).toBe("whatsapp");
    expect(callArgs.to).toBe("+10000000001");
    expect(callArgs.accountId).toBe("acc1");
    expect(callArgs.payloads).toHaveLength(1);
    expect(callArgs.payloads[0].text).toBe('📝 "hello world"');
  });

  it("uses custom echoFormat when provided", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({
      echoTranscript: true,
      echoFormat: "🎙️ Heard: {transcript}",
      transcribedText: "custom message",
    });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    const callArgs = expectSingleEchoDeliveryCall();
    expect(callArgs.payloads[0].text).toBe("🎙️ Heard: custom message");
  });

  it("does NOT echo when there are no audio attachments", async () => {
    // Image-only context — no audio attachment
    const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "img-"));
    const imgPath = path.join(dir, "photo.jpg");
    await fs.writeFile(imgPath, Buffer.from([0xff, 0xd8, 0xff, 0xe0]));
    const ctx: MsgContext = {
      Body: "",
      MediaPath: imgPath,
      MediaType: "image/jpeg",
      Provider: "whatsapp",
      From: "+10000000001",
    };
    const { cfg, providers } = createAudioConfigWithEcho({
      echoTranscript: true,
      transcribedText: "should not appear",
    });
    cfg.tools!.media!.image = { enabled: false };

    await applyMediaUnderstanding({ ctx, cfg, providers });

    // No audio outputs → Transcript not set → no echo
    expect(ctx.Transcript).toBeUndefined();
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when transcription fails", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
    providers.groq.transcribeAudio = async () => {
      throw new Error("transcription provider failure");
    };

    // Should not throw; transcription failure is swallowed by runner
    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(ctx.Transcript).toBeUndefined();
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when channel is not deliverable", async () => {
    const mediaPath = await createTempAudioFile();
    // Use an internal/non-deliverable channel
    const ctx = createAudioCtxWithProvider(mediaPath, {
      Provider: "internal-system",
      From: "some-source",
    });
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    // Transcript should be set (transcription succeeded)
    expect(ctx.Transcript).toBe("hello world");
    // But echo should be skipped
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when ctx has no From or OriginatingTo", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx: MsgContext = {
      Body: "",
      MediaPath: mediaPath,
      MediaType: "audio/ogg",
      Provider: "whatsapp",
      // From and OriginatingTo intentionally absent
    };
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(ctx.Transcript).toBe("hello world");
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("uses OriginatingTo when From is absent", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx: MsgContext = {
      Body: "",
      MediaPath: mediaPath,
      MediaType: "audio/ogg",
      Provider: "whatsapp",
      OriginatingTo: "+19999999999",
    };
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    const callArgs = expectSingleEchoDeliveryCall();
    expect(callArgs.to).toBe("+19999999999");
  });

  it("echo delivery failure does not throw or break transcription", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
    mockDeliverOutboundPayloads.mockRejectedValueOnce(new Error("delivery timeout"));

    // Should not throw
    const result = await applyMediaUnderstanding({ ctx, cfg, providers });

    // Transcription itself succeeded
    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("hello world");
    // Deliver was attempted
    expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
  });
});