From 4f2df617fee8f5bb292c1f0e83abb6d11658b8d2 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Tue, 31 Mar 2026 01:59:17 +0100
Subject: [PATCH] fix: handle Telegram audio auto-transcription

---
NOTE(review): this patch text was re-flowed from a whitespace-collapsed copy;
blank context lines and indentation are reconstructed from the hunk counts and
should be confirmed against the tree. The `From:` address and the type
arguments of `Record` on the `ProviderQuery` context line were already missing
and are left as found. Blob ids on the `index` lines predate the doc comment
added to `resolveCliMediaPath`.

 CHANGELOG.md                                  |  1 +
 src/media-understanding/apply.test.ts         | 65 +++++++++++++++++++
 .../audio-preflight.test.ts                   | 58 +++++++++++++++++
 src/media-understanding/audio-preflight.ts    |  2 +-
 src/media-understanding/runner.entries.ts     | 52 ++++++++++++++-
 5 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 src/media-understanding/audio-preflight.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e0f8bd025d0..6abb12e7857 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
 - Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss.
 - Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc.
 - Exec/env: block Python package index override variables from request-scoped host exec environment sanitization so package fetches cannot be redirected through a caller-supplied index. Thanks @nexrin and @vincentkoc.
+- Telegram/audio: transcode Telegram voice-note `.ogg` attachments before the local `whisper-cli` auto fallback runs, and keep mention-preflight transcription enabled in auto mode when `tools.media.audio` is unset.
 
 ## 2026.3.28
 
diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index a6f206a3a7b..b87644e7b81 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -25,12 +25,14 @@ const hasAvailableAuthForProviderMock = vi.hoisted(() =>
   }),
 );
 const fetchRemoteMediaMock = vi.hoisted(() => vi.fn());
+const runFfmpegMock = vi.hoisted(() => vi.fn());
 const runExecMock = vi.hoisted(() => vi.fn());
 
 let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
 let clearMediaUnderstandingBinaryCacheForTests: typeof import("./runner.js").clearMediaUnderstandingBinaryCacheForTests;
 const mockedResolveApiKey = resolveApiKeyForProviderMock;
 const mockedFetchRemoteMedia = fetchRemoteMediaMock;
+const mockedRunFfmpeg = runFfmpegMock;
 const mockedRunExec = runExecMock;
 
 const TEMP_MEDIA_PREFIX = "openclaw-media-";
@@ -255,6 +257,9 @@ describe("applyMediaUnderstanding", () => {
     vi.doMock("../media/fetch.js", () => ({
       fetchRemoteMedia: fetchRemoteMediaMock,
     }));
+    vi.doMock("../media/ffmpeg-exec.js", () => ({
+      runFfmpeg: runFfmpegMock,
+    }));
     vi.doMock("../process/exec.js", () => ({
       runExec: runExecMock,
     }));
@@ -304,6 +309,7 @@ describe("applyMediaUnderstanding", () => {
     });
     hasAvailableAuthForProviderMock.mockClear();
     mockedFetchRemoteMedia.mockClear();
+    mockedRunFfmpeg.mockReset();
     mockedRunExec.mockReset();
     mockedFetchRemoteMedia.mockResolvedValue({
       buffer: createSafeAudioFixtureBuffer(2048),
@@ -703,6 +709,65 @@ describe("applyMediaUnderstanding", () => {
     );
   });
 
+  it("transcodes non-wav audio before auto-detected whisper-cli runs", async () => {
+    const binDir = await createTempMediaDir();
+    const modelDir = await createTempMediaDir();
+    await createMockExecutable(binDir, "whisper-cli");
+    const modelPath = path.join(modelDir, "tiny.bin");
+    await fs.writeFile(modelPath, "model");
+
+    const ctx = await createAudioCtx({
+      fileName: "telegram-voice.ogg",
+      mediaType: "audio/ogg",
+      content: createSafeAudioFixtureBuffer(2048),
+    });
+    const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
+
+    mockedRunFfmpeg.mockImplementationOnce(async (args: string[]) => {
+      const wavPath = args.at(-1);
+      if (typeof wavPath !== "string") {
+        throw new Error("missing wav path");
+      }
+      await fs.writeFile(wavPath, Buffer.from("RIFF"));
+      return "";
+    });
+    mockedRunExec.mockResolvedValueOnce({
+      stdout: "whisper cpp ogg ok\n",
+      stderr: "",
+    });
+
+    await withMediaAutoDetectEnv(
+      {
+        PATH: binDir,
+        WHISPER_CPP_MODEL: modelPath,
+      },
+      async () => {
+        const result = await applyMediaUnderstanding({ ctx, cfg });
+        expect(result.appliedAudio).toBe(true);
+      },
+    );
+
+    expect(ctx.Transcript).toBe("whisper cpp ogg ok");
+    expect(mockedRunFfmpeg).toHaveBeenCalledWith(
+      expect.arrayContaining([
+        "-i",
+        expect.stringMatching(/telegram-voice\.ogg$/),
+        "-ac",
+        "1",
+        "-ar",
+        "16000",
+        "-c:a",
+        "pcm_s16le",
+        expect.stringMatching(/telegram-voice\.wav$/),
+      ]),
+    );
+    expect(mockedRunExec).toHaveBeenCalledWith(
+      "whisper-cli",
+      expect.arrayContaining([expect.stringMatching(/telegram-voice\.wav$/)]),
+      expect.any(Object),
+    );
+  });
+
   it("skips audio auto-detect when no supported binaries or provider keys are available", async () => {
     const emptyBinDir = await createTempMediaDir();
     const isolatedAgentDir = await createTempMediaDir();
diff --git a/src/media-understanding/audio-preflight.test.ts b/src/media-understanding/audio-preflight.test.ts
new file mode 100644
index 00000000000..32ceb85ddfc
--- /dev/null
+++ b/src/media-understanding/audio-preflight.test.ts
@@ -0,0 +1,58 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+const runAudioTranscriptionMock = vi.hoisted(() => vi.fn());
+
+vi.mock("./audio-transcription-runner.js", () => ({
+  runAudioTranscription: (...args: unknown[]) => runAudioTranscriptionMock(...args),
+}));
+
+let transcribeFirstAudio: typeof import("./audio-preflight.js").transcribeFirstAudio;
+
+describe("transcribeFirstAudio", () => {
+  beforeEach(async () => {
+    vi.resetModules();
+    runAudioTranscriptionMock.mockReset();
+    ({ transcribeFirstAudio } = await import("./audio-preflight.js"));
+  });
+
+  it("runs audio preflight in auto mode when audio config is absent", async () => {
+    runAudioTranscriptionMock.mockResolvedValueOnce({
+      transcript: "voice note transcript",
+      attachments: [],
+    });
+
+    const transcript = await transcribeFirstAudio({
+      ctx: {
+        Body: "",
+        MediaPath: "/tmp/voice.ogg",
+        MediaType: "audio/ogg",
+      },
+      cfg: {},
+    });
+
+    expect(transcript).toBe("voice note transcript");
+    expect(runAudioTranscriptionMock).toHaveBeenCalledTimes(1);
+  });
+
+  it("skips audio preflight when audio config is explicitly disabled", async () => {
+    const transcript = await transcribeFirstAudio({
+      ctx: {
+        Body: "",
+        MediaPath: "/tmp/voice.ogg",
+        MediaType: "audio/ogg",
+      },
+      cfg: {
+        tools: {
+          media: {
+            audio: {
+              enabled: false,
+            },
+          },
+        },
+      },
+    });
+
+    expect(transcript).toBeUndefined();
+    expect(runAudioTranscriptionMock).not.toHaveBeenCalled();
+  });
+});
diff --git a/src/media-understanding/audio-preflight.ts b/src/media-understanding/audio-preflight.ts
index 735f921510c..19e0dd88626 100644
--- a/src/media-understanding/audio-preflight.ts
+++ b/src/media-understanding/audio-preflight.ts
@@ -26,7 +26,7 @@ export async function transcribeFirstAudio(params: {
 
   // Check if audio transcription is enabled in config
   const audioConfig = cfg.tools?.media?.audio;
-  if (!audioConfig || audioConfig.enabled === false) {
+  if (audioConfig?.enabled === false) {
     return undefined;
   }
 
diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts
index fffe4aa1e0f..98c148e8dd4 100644
--- a/src/media-understanding/runner.entries.ts
+++ b/src/media-understanding/runner.entries.ts
@@ -15,6 +15,7 @@ import type {
 import { logVerbose, shouldLogVerbose } from "../globals.js";
 import { resolveProxyFetchFromEnv } from "../infra/net/proxy-fetch.js";
 import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
+import { runFfmpeg } from "../media/ffmpeg-exec.js";
 import { runExec } from "../process/exec.js";
 import { MediaAttachmentCache } from "./attachments.js";
 import {
@@ -210,6 +211,50 @@ async function resolveCliOutput(params: {
   return params.stdout.trim();
 }
 
+/**
+ * Picks the media file path that a CLI entry should be invoked with.
+ *
+ * Only the audio capability combined with a command whose `commandBase()` is
+ * `whisper-cli` is rewritten: unless the input already has a `.wav` extension
+ * (case-insensitive), ffmpeg converts it to mono, 16 kHz, 16-bit PCM WAV inside
+ * `outputDir` and the path of that WAV is returned. Every other combination gets
+ * `mediaPath` back untouched.
+ *
+ * A rejection from `runFfmpeg` is not caught here and reaches the caller.
+ */
+async function resolveCliMediaPath(params: {
+  capability: MediaUnderstandingCapability;
+  command: string;
+  mediaPath: string;
+  outputDir: string;
+}): Promise<string> {
+  const commandId = commandBase(params.command);
+  if (params.capability !== "audio" || commandId !== "whisper-cli") {
+    return params.mediaPath;
+  }
+
+  const ext = path.extname(params.mediaPath).toLowerCase();
+  if (ext === ".wav") {
+    return params.mediaPath;
+  }
+
+  // Same base name as the input, placed in outputDir with a .wav extension.
+  const wavPath = path.join(params.outputDir, `${path.parse(params.mediaPath).name}.wav`);
+  await runFfmpeg([
+    "-y",
+    "-i",
+    params.mediaPath,
+    "-ac",
+    "1",
+    "-ar",
+    "16000",
+    "-c:a",
+    "pcm_s16le",
+    wavPath,
+  ]);
+  return wavPath;
+}
+
 type ProviderQuery = Record;
 
 function normalizeProviderQuery(
@@ -619,7 +664,12 @@ export async function runCliEntry(params: {
   const outputDir = await fs.mkdtemp(
     path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
   );
-  const mediaPath = pathResult.path;
+  const mediaPath = await resolveCliMediaPath({
+    capability,
+    command,
+    mediaPath: pathResult.path,
+    outputDir,
+  });
   const outputBase = path.join(outputDir, path.parse(mediaPath).name);
 
   const templCtx: MsgContext = {