fix: handle Telegram audio auto-transcription

This commit is contained in:
Peter Steinberger 2026-03-31 01:59:17 +01:00
parent 121870a085
commit 4f2df617fe
No known key found for this signature in database
5 changed files with 164 additions and 2 deletions

View File

@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
- Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss.
- Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc.
- Exec/env: block Python package index override variables from request-scoped host exec environment sanitization so package fetches cannot be redirected through a caller-supplied index. Thanks @nexrin and @vincentkoc.
- Telegram/audio: transcode Telegram voice-note `.ogg` attachments before the local `whisper-cli` auto fallback runs, and keep mention-preflight transcription enabled in auto mode when `tools.media.audio` is unset.
## 2026.3.28

View File

@ -25,12 +25,14 @@ const hasAvailableAuthForProviderMock = vi.hoisted(() =>
}),
);
const fetchRemoteMediaMock = vi.hoisted(() => vi.fn());
const runFfmpegMock = vi.hoisted(() => vi.fn());
const runExecMock = vi.hoisted(() => vi.fn());
let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
let clearMediaUnderstandingBinaryCacheForTests: typeof import("./runner.js").clearMediaUnderstandingBinaryCacheForTests;
const mockedResolveApiKey = resolveApiKeyForProviderMock;
const mockedFetchRemoteMedia = fetchRemoteMediaMock;
const mockedRunFfmpeg = runFfmpegMock;
const mockedRunExec = runExecMock;
const TEMP_MEDIA_PREFIX = "openclaw-media-";
@ -255,6 +257,9 @@ describe("applyMediaUnderstanding", () => {
vi.doMock("../media/fetch.js", () => ({
fetchRemoteMedia: fetchRemoteMediaMock,
}));
vi.doMock("../media/ffmpeg-exec.js", () => ({
runFfmpeg: runFfmpegMock,
}));
vi.doMock("../process/exec.js", () => ({
runExec: runExecMock,
}));
@ -304,6 +309,7 @@ describe("applyMediaUnderstanding", () => {
});
hasAvailableAuthForProviderMock.mockClear();
mockedFetchRemoteMedia.mockClear();
mockedRunFfmpeg.mockReset();
mockedRunExec.mockReset();
mockedFetchRemoteMedia.mockResolvedValue({
buffer: createSafeAudioFixtureBuffer(2048),
@ -703,6 +709,65 @@ describe("applyMediaUnderstanding", () => {
);
});
it("transcodes non-wav audio before auto-detected whisper-cli runs", async () => {
// Stage a fake whisper-cli executable on PATH plus a model file so the
// audio auto-detect path selects the local whisper-cli fallback.
const binDir = await createTempMediaDir();
const modelDir = await createTempMediaDir();
await createMockExecutable(binDir, "whisper-cli");
const modelPath = path.join(modelDir, "tiny.bin");
await fs.writeFile(modelPath, "model");
// Incoming attachment is an .ogg voice note — the case that must be
// transcoded to wav before whisper-cli runs.
const ctx = await createAudioCtx({
fileName: "telegram-voice.ogg",
mediaType: "audio/ogg",
content: createSafeAudioFixtureBuffer(2048),
});
// Audio config present but empty: auto mode, no explicit model/provider.
const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
// The ffmpeg mock writes the wav output (last CLI arg) so the pipeline
// finds a real file at the transcoded path.
mockedRunFfmpeg.mockImplementationOnce(async (args: string[]) => {
const wavPath = args.at(-1);
if (typeof wavPath !== "string") {
throw new Error("missing wav path");
}
await fs.writeFile(wavPath, Buffer.from("RIFF"));
return "";
});
// whisper-cli invocation succeeds with a transcript on stdout.
mockedRunExec.mockResolvedValueOnce({
stdout: "whisper cpp ogg ok\n",
stderr: "",
});
await withMediaAutoDetectEnv(
{
PATH: binDir,
WHISPER_CPP_MODEL: modelPath,
},
async () => {
const result = await applyMediaUnderstanding({ ctx, cfg });
expect(result.appliedAudio).toBe(true);
},
);
// Transcript is trimmed stdout from the whisper-cli run.
expect(ctx.Transcript).toBe("whisper cpp ogg ok");
// ffmpeg must be called with the mono/16-kHz/pcm_s16le args whisper.cpp
// expects, reading the .ogg and writing a sibling-named .wav.
expect(mockedRunFfmpeg).toHaveBeenCalledWith(
expect.arrayContaining([
"-i",
expect.stringMatching(/telegram-voice\.ogg$/),
"-ac",
"1",
"-ar",
"16000",
"-c:a",
"pcm_s16le",
expect.stringMatching(/telegram-voice\.wav$/),
]),
);
// whisper-cli must receive the transcoded .wav, not the original .ogg.
expect(mockedRunExec).toHaveBeenCalledWith(
"whisper-cli",
expect.arrayContaining([expect.stringMatching(/telegram-voice\.wav$/)]),
expect.any(Object),
);
});
it("skips audio auto-detect when no supported binaries or provider keys are available", async () => {
const emptyBinDir = await createTempMediaDir();
const isolatedAgentDir = await createTempMediaDir();

View File

@ -0,0 +1,58 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
// Hoisted so the vi.mock factory below can reference the mock instance
// (vi.mock calls are hoisted above regular module evaluation).
const runAudioTranscriptionMock = vi.hoisted(() => vi.fn());
vi.mock("./audio-transcription-runner.js", () => ({
// Delegate through a wrapper so tests can reset/inspect the same mock.
runAudioTranscription: (...args: unknown[]) => runAudioTranscriptionMock(...args),
}));
// Assigned via dynamic import in beforeEach, after vi.resetModules().
let transcribeFirstAudio: typeof import("./audio-preflight.js").transcribeFirstAudio;
describe("transcribeFirstAudio", () => {
// Build a fresh Telegram-style voice-note context per call so no test
// observes mutations made by another.
const makeVoiceCtx = () => ({
Body: "<media:audio>",
MediaPath: "/tmp/voice.ogg",
MediaType: "audio/ogg",
});

beforeEach(async () => {
vi.resetModules();
runAudioTranscriptionMock.mockReset();
({ transcribeFirstAudio } = await import("./audio-preflight.js"));
});

it("runs audio preflight in auto mode when audio config is absent", async () => {
// With no tools.media.audio config at all, preflight still runs (auto mode).
runAudioTranscriptionMock.mockResolvedValueOnce({
transcript: "voice note transcript",
attachments: [],
});

const result = await transcribeFirstAudio({ ctx: makeVoiceCtx(), cfg: {} });

expect(runAudioTranscriptionMock).toHaveBeenCalledTimes(1);
expect(result).toBe("voice note transcript");
});

it("skips audio preflight when audio config is explicitly disabled", async () => {
// enabled: false is the only setting that must suppress the preflight.
const cfg = { tools: { media: { audio: { enabled: false } } } };

const result = await transcribeFirstAudio({ ctx: makeVoiceCtx(), cfg });

expect(result).toBeUndefined();
expect(runAudioTranscriptionMock).not.toHaveBeenCalled();
});
});

View File

@ -26,7 +26,7 @@ export async function transcribeFirstAudio(params: {
// Check if audio transcription is enabled in config
const audioConfig = cfg.tools?.media?.audio;
if (!audioConfig || audioConfig.enabled === false) {
if (audioConfig?.enabled === false) {
return undefined;
}

View File

@ -15,6 +15,7 @@ import type {
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { resolveProxyFetchFromEnv } from "../infra/net/proxy-fetch.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
import { runFfmpeg } from "../media/ffmpeg-exec.js";
import { runExec } from "../process/exec.js";
import { MediaAttachmentCache } from "./attachments.js";
import {
@ -210,6 +211,38 @@ async function resolveCliOutput(params: {
return params.stdout.trim();
}
/**
 * Returns the media path a CLI transcription command should consume.
 *
 * whisper-cli only accepts WAV input, so for audio handled by whisper-cli
 * any non-`.wav` file is first transcoded (mono, 16 kHz, signed 16-bit PCM —
 * the format whisper.cpp expects) into `outputDir`. All other
 * capability/command combinations use the original file untouched.
 */
async function resolveCliMediaPath(params: {
  capability: MediaUnderstandingCapability;
  command: string;
  mediaPath: string;
  outputDir: string;
}): Promise<string> {
  const isWhisperAudio =
    params.capability === "audio" && commandBase(params.command) === "whisper-cli";
  if (!isWhisperAudio) {
    return params.mediaPath;
  }
  // Already WAV (case-insensitive extension check) — nothing to transcode.
  if (path.extname(params.mediaPath).toLowerCase() === ".wav") {
    return params.mediaPath;
  }
  const baseName = path.parse(params.mediaPath).name;
  const wavPath = path.join(params.outputDir, `${baseName}.wav`);
  const ffmpegArgs = [
    "-y", // overwrite without prompting
    "-i",
    params.mediaPath,
    "-ac",
    "1", // mono
    "-ar",
    "16000", // 16 kHz sample rate
    "-c:a",
    "pcm_s16le", // signed 16-bit little-endian PCM
    wavPath,
  ];
  await runFfmpeg(ffmpegArgs);
  return wavPath;
}
type ProviderQuery = Record<string, string | number | boolean>;
function normalizeProviderQuery(
@ -619,7 +652,12 @@ export async function runCliEntry(params: {
const outputDir = await fs.mkdtemp(
path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
);
const mediaPath = pathResult.path;
const mediaPath = await resolveCliMediaPath({
capability,
command,
mediaPath: pathResult.path,
outputDir,
});
const outputBase = path.join(outputDir, path.parse(mediaPath).name);
const templCtx: MsgContext = {