From 4f2df617fee8f5bb292c1f0e83abb6d11658b8d2 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Tue, 31 Mar 2026 01:59:17 +0100
Subject: [PATCH] fix: handle Telegram audio auto-transcription

---
 CHANGELOG.md                                  |  1 +
 src/media-understanding/apply.test.ts         | 65 +++++++++++++++++++
 .../audio-preflight.test.ts                   | 58 +++++++++++++++++
 src/media-understanding/audio-preflight.ts    |  2 +-
 src/media-understanding/runner.entries.ts     | 40 +++++++++++-
 5 files changed, 164 insertions(+), 2 deletions(-)
 create mode 100644 src/media-understanding/audio-preflight.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e0f8bd025d0..6abb12e7857 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
 - Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss.
 - Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc.
 - Exec/env: block Python package index override variables from request-scoped host exec environment sanitization so package fetches cannot be redirected through a caller-supplied index. Thanks @nexrin and @vincentkoc.
+- Telegram/audio: transcode Telegram voice-note `.ogg` attachments (and any other non-`.wav` audio) to 16 kHz mono WAV before the local `whisper-cli` auto fallback runs, and keep mention-preflight transcription enabled in auto mode when `tools.media.audio` is unset.
 
 ## 2026.3.28
 
diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index a6f206a3a7b..b87644e7b81 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -25,12 +25,14 @@ const hasAvailableAuthForProviderMock = vi.hoisted(() =>
   }),
 );
 const fetchRemoteMediaMock = vi.hoisted(() => vi.fn());
+const runFfmpegMock = vi.hoisted(() => vi.fn());
 const runExecMock = vi.hoisted(() => vi.fn());
 
 let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
 let clearMediaUnderstandingBinaryCacheForTests: typeof import("./runner.js").clearMediaUnderstandingBinaryCacheForTests;
 const mockedResolveApiKey = resolveApiKeyForProviderMock;
 const mockedFetchRemoteMedia = fetchRemoteMediaMock;
+const mockedRunFfmpeg = runFfmpegMock;
 const mockedRunExec = runExecMock;
 
 const TEMP_MEDIA_PREFIX = "openclaw-media-";
@@ -255,6 +257,9 @@ describe("applyMediaUnderstanding", () => {
     vi.doMock("../media/fetch.js", () => ({
       fetchRemoteMedia: fetchRemoteMediaMock,
     }));
+    vi.doMock("../media/ffmpeg-exec.js", () => ({
+      runFfmpeg: runFfmpegMock,
+    }));
     vi.doMock("../process/exec.js", () => ({
       runExec: runExecMock,
     }));
@@ -304,6 +309,7 @@ describe("applyMediaUnderstanding", () => {
     });
     hasAvailableAuthForProviderMock.mockClear();
     mockedFetchRemoteMedia.mockClear();
+    mockedRunFfmpeg.mockReset();
     mockedRunExec.mockReset();
     mockedFetchRemoteMedia.mockResolvedValue({
       buffer: createSafeAudioFixtureBuffer(2048),
@@ -703,6 +709,65 @@ describe("applyMediaUnderstanding", () => {
     );
   });
 
+  it("transcodes non-wav audio before auto-detected whisper-cli runs", async () => {
+    const binDir = await createTempMediaDir();
+    const modelDir = await createTempMediaDir();
+    await createMockExecutable(binDir, "whisper-cli"); // binDir is used as PATH below
+    const modelPath = path.join(modelDir, "tiny.bin");
+    await fs.writeFile(modelPath, "model");
+
+    const ctx = await createAudioCtx({
+      fileName: "telegram-voice.ogg",
+      mediaType: "audio/ogg",
+      content: createSafeAudioFixtureBuffer(2048),
+    });
+    const cfg: OpenClawConfig = { tools: { media: { audio: {} } } }; // audio section present but empty
+
+    mockedRunFfmpeg.mockImplementationOnce(async (args: string[]) => {
+      const wavPath = args.at(-1); // the output path is the last ffmpeg argument
+      if (typeof wavPath !== "string") {
+        throw new Error("missing wav path");
+      }
+      await fs.writeFile(wavPath, Buffer.from("RIFF")); // stub file in place of a real transcode
+      return "";
+    });
+    mockedRunExec.mockResolvedValueOnce({
+      stdout: "whisper cpp ogg ok\n",
+      stderr: "",
+    });
+
+    await withMediaAutoDetectEnv(
+      {
+        PATH: binDir,
+        WHISPER_CPP_MODEL: modelPath,
+      },
+      async () => {
+        const result = await applyMediaUnderstanding({ ctx, cfg });
+        expect(result.appliedAudio).toBe(true);
+      },
+    );
+
+    expect(ctx.Transcript).toBe("whisper cpp ogg ok"); // mocked stdout without its trailing newline
+    expect(mockedRunFfmpeg).toHaveBeenCalledWith(
+      expect.arrayContaining([
+        "-i",
+        expect.stringMatching(/telegram-voice\.ogg$/), // original attachment is the ffmpeg input
+        "-ac",
+        "1",
+        "-ar",
+        "16000",
+        "-c:a",
+        "pcm_s16le",
+        expect.stringMatching(/telegram-voice\.wav$/), // same base name, .wav extension
+      ]),
+    );
+    expect(mockedRunExec).toHaveBeenCalledWith(
+      "whisper-cli",
+      expect.arrayContaining([expect.stringMatching(/telegram-voice\.wav$/)]), // CLI args include the .wav
+      expect.any(Object),
+    );
+  });
+
   it("skips audio auto-detect when no supported binaries or provider keys are available", async () => {
     const emptyBinDir = await createTempMediaDir();
     const isolatedAgentDir = await createTempMediaDir();
diff --git a/src/media-understanding/audio-preflight.test.ts b/src/media-understanding/audio-preflight.test.ts
new file mode 100644
index 00000000000..32ceb85ddfc
--- /dev/null
+++ b/src/media-understanding/audio-preflight.test.ts
@@ -0,0 +1,58 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+const runAudioTranscriptionMock = vi.hoisted(() => vi.fn()); // hoisted so the vi.mock factory can use it
+
+vi.mock("./audio-transcription-runner.js", () => ({
+  runAudioTranscription: (...args: unknown[]) => runAudioTranscriptionMock(...args), // forward to the spy
+}));
+
+let transcribeFirstAudio: typeof import("./audio-preflight.js").transcribeFirstAudio; // set in beforeEach
+
+describe("transcribeFirstAudio", () => {
+  beforeEach(async () => {
+    vi.resetModules(); // clear the module cache so the import below is re-evaluated
+    runAudioTranscriptionMock.mockReset();
+    ({ transcribeFirstAudio } = await import("./audio-preflight.js"));
+  });
+
+  it("runs audio preflight in auto mode when audio config is absent", async () => {
+    runAudioTranscriptionMock.mockResolvedValueOnce({
+      transcript: "voice note transcript",
+      attachments: [],
+    });
+
+    const transcript = await transcribeFirstAudio({
+      ctx: {
+        Body: "<media:audio>",
+        MediaPath: "/tmp/voice.ogg",
+        MediaType: "audio/ogg",
+      },
+      cfg: {}, // no tools.media.audio section at all
+    });
+
+    expect(transcript).toBe("voice note transcript");
+    expect(runAudioTranscriptionMock).toHaveBeenCalledTimes(1); // the runner was invoked exactly once
+  });
+
+  it("skips audio preflight when audio config is explicitly disabled", async () => {
+    const transcript = await transcribeFirstAudio({
+      ctx: {
+        Body: "<media:audio>",
+        MediaPath: "/tmp/voice.ogg",
+        MediaType: "audio/ogg",
+      },
+      cfg: {
+        tools: {
+          media: {
+            audio: {
+              enabled: false, // explicit opt-out
+            },
+          },
+        },
+      },
+    });
+
+    expect(transcript).toBeUndefined();
+    expect(runAudioTranscriptionMock).not.toHaveBeenCalled(); // the runner must not run when disabled
+  });
+});
diff --git a/src/media-understanding/audio-preflight.ts b/src/media-understanding/audio-preflight.ts
index 735f921510c..19e0dd88626 100644
--- a/src/media-understanding/audio-preflight.ts
+++ b/src/media-understanding/audio-preflight.ts
@@ -26,7 +26,7 @@ export async function transcribeFirstAudio(params: {
 
   // Check if audio transcription is enabled in config
   const audioConfig = cfg.tools?.media?.audio;
-  if (!audioConfig || audioConfig.enabled === false) {
+  if (audioConfig?.enabled === false) {
     return undefined;
   }
 
diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts
index fffe4aa1e0f..98c148e8dd4 100644
--- a/src/media-understanding/runner.entries.ts
+++ b/src/media-understanding/runner.entries.ts
@@ -15,6 +15,7 @@ import type {
 import { logVerbose, shouldLogVerbose } from "../globals.js";
 import { resolveProxyFetchFromEnv } from "../infra/net/proxy-fetch.js";
 import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
+import { runFfmpeg } from "../media/ffmpeg-exec.js";
 import { runExec } from "../process/exec.js";
 import { MediaAttachmentCache } from "./attachments.js";
 import {
@@ -210,6 +211,38 @@ async function resolveCliOutput(params: {
   return params.stdout.trim();
 }
 
+async function resolveCliMediaPath(params: {
+  capability: MediaUnderstandingCapability;
+  command: string;
+  mediaPath: string; // input file for the CLI
+  outputDir: string; // directory that receives the transcoded .wav
+}): Promise<string> { // resolves to the media path the CLI should read
+  const commandId = commandBase(params.command);
+  if (params.capability !== "audio" || commandId !== "whisper-cli") {
+    return params.mediaPath; // only audio handled by whisper-cli is transcoded
+  }
+
+  const ext = path.extname(params.mediaPath).toLowerCase();
+  if (ext === ".wav") {
+    return params.mediaPath; // already .wav by extension (case-insensitive); no ffmpeg run
+  }
+
+  const wavPath = path.join(params.outputDir, `${path.parse(params.mediaPath).name}.wav`);
+  await runFfmpeg([
+    "-y", // overwrite the output file if it exists
+    "-i",
+    params.mediaPath,
+    "-ac", // audio channels: 1 (mono)
+    "1",
+    "-ar", // audio sample rate: 16000 Hz
+    "16000",
+    "-c:a", // audio codec: signed 16-bit little-endian PCM
+    "pcm_s16le",
+    wavPath,
+  ]);
+  return wavPath; // same base name as the input, written under outputDir
+}
+
 type ProviderQuery = Record<string, string | number | boolean>;
 
 function normalizeProviderQuery(
@@ -619,7 +652,12 @@ export async function runCliEntry(params: {
   const outputDir = await fs.mkdtemp(
     path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
   );
-  const mediaPath = pathResult.path;
+  const mediaPath = await resolveCliMediaPath({
+    capability,
+    command,
+    mediaPath: pathResult.path,
+    outputDir,
+  });
   const outputBase = path.join(outputDir, path.parse(mediaPath).name);
 
   const templCtx: MsgContext = {