fix: handle Telegram audio auto-transcription

This commit is contained in:
Peter Steinberger 2026-03-31 01:59:17 +01:00
parent 121870a085
commit 4f2df617fe
No known key found for this signature in database
5 changed files with 164 additions and 2 deletions

View File

@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
- Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss.
- Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc.
- Exec/env: block Python package index override variables from request-scoped host exec environment sanitization so package fetches cannot be redirected through a caller-supplied index. Thanks @nexrin and @vincentkoc.
- Telegram/audio: transcode Telegram voice-note `.ogg` attachments before the local `whisper-cli` auto fallback runs, and keep mention-preflight transcription enabled in auto mode when `tools.media.audio` is unset.
## 2026.3.28

View File

@ -25,12 +25,14 @@ const hasAvailableAuthForProviderMock = vi.hoisted(() =>
}),
);
const fetchRemoteMediaMock = vi.hoisted(() => vi.fn());
const runFfmpegMock = vi.hoisted(() => vi.fn());
const runExecMock = vi.hoisted(() => vi.fn());
let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
let clearMediaUnderstandingBinaryCacheForTests: typeof import("./runner.js").clearMediaUnderstandingBinaryCacheForTests;
const mockedResolveApiKey = resolveApiKeyForProviderMock;
const mockedFetchRemoteMedia = fetchRemoteMediaMock;
const mockedRunFfmpeg = runFfmpegMock;
const mockedRunExec = runExecMock;
const TEMP_MEDIA_PREFIX = "openclaw-media-";
@ -255,6 +257,9 @@ describe("applyMediaUnderstanding", () => {
vi.doMock("../media/fetch.js", () => ({
fetchRemoteMedia: fetchRemoteMediaMock,
}));
vi.doMock("../media/ffmpeg-exec.js", () => ({
runFfmpeg: runFfmpegMock,
}));
vi.doMock("../process/exec.js", () => ({
runExec: runExecMock,
}));
@ -304,6 +309,7 @@ describe("applyMediaUnderstanding", () => {
});
hasAvailableAuthForProviderMock.mockClear();
mockedFetchRemoteMedia.mockClear();
mockedRunFfmpeg.mockReset();
mockedRunExec.mockReset();
mockedFetchRemoteMedia.mockResolvedValue({
buffer: createSafeAudioFixtureBuffer(2048),
@ -703,6 +709,65 @@ describe("applyMediaUnderstanding", () => {
);
});
it("transcodes non-wav audio before auto-detected whisper-cli runs", async () => {
// Stage a fake whisper-cli executable on PATH plus a model file so the
// audio auto-detect path selects the local whisper-cli fallback.
const binDir = await createTempMediaDir();
const modelDir = await createTempMediaDir();
await createMockExecutable(binDir, "whisper-cli");
const modelPath = path.join(modelDir, "tiny.bin");
await fs.writeFile(modelPath, "model");
// Incoming attachment is an .ogg voice note — the case that must be
// transcoded to wav before whisper-cli runs.
const ctx = await createAudioCtx({
fileName: "telegram-voice.ogg",
mediaType: "audio/ogg",
content: createSafeAudioFixtureBuffer(2048),
});
// Audio config present but empty: auto mode, no explicit model/provider.
const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
// The ffmpeg mock writes the wav output (last CLI arg) so the pipeline
// finds a real file at the transcoded path.
mockedRunFfmpeg.mockImplementationOnce(async (args: string[]) => {
const wavPath = args.at(-1);
if (typeof wavPath !== "string") {
throw new Error("missing wav path");
}
await fs.writeFile(wavPath, Buffer.from("RIFF"));
return "";
});
// whisper-cli invocation succeeds with a transcript on stdout.
mockedRunExec.mockResolvedValueOnce({
stdout: "whisper cpp ogg ok\n",
stderr: "",
});
await withMediaAutoDetectEnv(
{
PATH: binDir,
WHISPER_CPP_MODEL: modelPath,
},
async () => {
const result = await applyMediaUnderstanding({ ctx, cfg });
expect(result.appliedAudio).toBe(true);
},
);
// Transcript is trimmed stdout from the whisper-cli run.
expect(ctx.Transcript).toBe("whisper cpp ogg ok");
// ffmpeg must be called with the mono/16-kHz/pcm_s16le args whisper.cpp
// expects, reading the .ogg and writing a sibling-named .wav.
expect(mockedRunFfmpeg).toHaveBeenCalledWith(
expect.arrayContaining([
"-i",
expect.stringMatching(/telegram-voice\.ogg$/),
"-ac",
"1",
"-ar",
"16000",
"-c:a",
"pcm_s16le",
expect.stringMatching(/telegram-voice\.wav$/),
]),
);
// whisper-cli must receive the transcoded .wav, not the original .ogg.
expect(mockedRunExec).toHaveBeenCalledWith(
"whisper-cli",
expect.arrayContaining([expect.stringMatching(/telegram-voice\.wav$/)]),
expect.any(Object),
);
});
it("skips audio auto-detect when no supported binaries or provider keys are available", async () => {
const emptyBinDir = await createTempMediaDir();
const isolatedAgentDir = await createTempMediaDir();

View File

@ -0,0 +1,58 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
// Hoisted so the vi.mock factory below can reference the mock instance
// (vi.mock calls are hoisted above regular module evaluation).
const runAudioTranscriptionMock = vi.hoisted(() => vi.fn());
vi.mock("./audio-transcription-runner.js", () => ({
// Delegate through a wrapper so tests can reset/inspect the same mock.
runAudioTranscription: (...args: unknown[]) => runAudioTranscriptionMock(...args),
}));
// Assigned via dynamic import in beforeEach, after vi.resetModules().
let transcribeFirstAudio: typeof import("./audio-preflight.js").transcribeFirstAudio;
describe("transcribeFirstAudio", () => {
// Build a fresh Telegram-style voice-note context per call so no test
// observes mutations made by another.
const makeVoiceCtx = () => ({
Body: "<media:audio>",
MediaPath: "/tmp/voice.ogg",
MediaType: "audio/ogg",
});

beforeEach(async () => {
vi.resetModules();
runAudioTranscriptionMock.mockReset();
({ transcribeFirstAudio } = await import("./audio-preflight.js"));
});

it("runs audio preflight in auto mode when audio config is absent", async () => {
// With no tools.media.audio config at all, preflight still runs (auto mode).
runAudioTranscriptionMock.mockResolvedValueOnce({
transcript: "voice note transcript",
attachments: [],
});

const result = await transcribeFirstAudio({ ctx: makeVoiceCtx(), cfg: {} });

expect(runAudioTranscriptionMock).toHaveBeenCalledTimes(1);
expect(result).toBe("voice note transcript");
});

it("skips audio preflight when audio config is explicitly disabled", async () => {
// enabled: false is the only setting that must suppress the preflight.
const cfg = { tools: { media: { audio: { enabled: false } } } };

const result = await transcribeFirstAudio({ ctx: makeVoiceCtx(), cfg });

expect(result).toBeUndefined();
expect(runAudioTranscriptionMock).not.toHaveBeenCalled();
});
});

View File

@ -26,7 +26,7 @@ export async function transcribeFirstAudio(params: {
// Check if audio transcription is enabled in config
const audioConfig = cfg.tools?.media?.audio;
if (!audioConfig || audioConfig.enabled === false) {
if (audioConfig?.enabled === false) {
return undefined;
}

View File

@ -15,6 +15,7 @@ import type {
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { resolveProxyFetchFromEnv } from "../infra/net/proxy-fetch.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
import { runFfmpeg } from "../media/ffmpeg-exec.js";
import { runExec } from "../process/exec.js";
import { MediaAttachmentCache } from "./attachments.js";
import {
@ -210,6 +211,38 @@ async function resolveCliOutput(params: {
return params.stdout.trim();
}
/**
 * Returns the media path a CLI transcription command should consume.
 *
 * whisper-cli only accepts WAV input, so for audio handled by whisper-cli
 * any non-`.wav` file is first transcoded (mono, 16 kHz, signed 16-bit PCM —
 * the format whisper.cpp expects) into `outputDir`. All other
 * capability/command combinations use the original file untouched.
 */
async function resolveCliMediaPath(params: {
  capability: MediaUnderstandingCapability;
  command: string;
  mediaPath: string;
  outputDir: string;
}): Promise<string> {
  const isWhisperAudio =
    params.capability === "audio" && commandBase(params.command) === "whisper-cli";
  if (!isWhisperAudio) {
    return params.mediaPath;
  }
  // Already WAV (case-insensitive extension check) — nothing to transcode.
  if (path.extname(params.mediaPath).toLowerCase() === ".wav") {
    return params.mediaPath;
  }
  const baseName = path.parse(params.mediaPath).name;
  const wavPath = path.join(params.outputDir, `${baseName}.wav`);
  const ffmpegArgs = [
    "-y", // overwrite without prompting
    "-i",
    params.mediaPath,
    "-ac",
    "1", // mono
    "-ar",
    "16000", // 16 kHz sample rate
    "-c:a",
    "pcm_s16le", // signed 16-bit little-endian PCM
    wavPath,
  ];
  await runFfmpeg(ffmpegArgs);
  return wavPath;
}
type ProviderQuery = Record<string, string | number | boolean>;
function normalizeProviderQuery(
@ -619,7 +652,12 @@ export async function runCliEntry(params: {
const outputDir = await fs.mkdtemp(
path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
);
const mediaPath = pathResult.path;
const mediaPath = await resolveCliMediaPath({
capability,
command,
mediaPath: pathResult.path,
outputDir,
});
const outputBase = path.join(outputDir, path.parse(mediaPath).name);
const templCtx: MsgContext = {