mirror of https://github.com/openclaw/openclaw.git
fix: handle Telegram audio auto-transcription
This commit is contained in:
parent
121870a085
commit
4f2df617fe
|
|
@ -118,6 +118,7 @@ Docs: https://docs.openclaw.ai
|
|||
- Matrix/direct rooms: stop trusting remote `is_direct`, honor explicit local `is_direct: false` for discovered DM candidates, and avoid extra member-state lookups for shared rooms so DM routing and repair stay aligned. (#57124) Thanks @w-sss.
|
||||
- Agents/sandbox: make remote FS bridge reads pin the parent path and open the file atomically in the helper so read access cannot race path resolution. Thanks @AntAISecurityLab and @vincentkoc.
|
||||
- Exec/env: strip Python package index override variables during request-scoped host exec environment sanitization so package fetches cannot be redirected through a caller-supplied index. Thanks @nexrin and @vincentkoc.
|
||||
- Telegram/audio: transcode Telegram voice-note `.ogg` attachments before the local `whisper-cli` auto fallback runs, and keep mention-preflight transcription enabled in auto mode when `tools.media.audio` is unset.
|
||||
|
||||
## 2026.3.28
|
||||
|
||||
|
|
|
|||
|
|
@ -25,12 +25,14 @@ const hasAvailableAuthForProviderMock = vi.hoisted(() =>
|
|||
}),
|
||||
);
|
||||
const fetchRemoteMediaMock = vi.hoisted(() => vi.fn());
|
||||
const runFfmpegMock = vi.hoisted(() => vi.fn());
|
||||
const runExecMock = vi.hoisted(() => vi.fn());
|
||||
|
||||
let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
|
||||
let clearMediaUnderstandingBinaryCacheForTests: typeof import("./runner.js").clearMediaUnderstandingBinaryCacheForTests;
|
||||
const mockedResolveApiKey = resolveApiKeyForProviderMock;
|
||||
const mockedFetchRemoteMedia = fetchRemoteMediaMock;
|
||||
const mockedRunFfmpeg = runFfmpegMock;
|
||||
const mockedRunExec = runExecMock;
|
||||
|
||||
const TEMP_MEDIA_PREFIX = "openclaw-media-";
|
||||
|
|
@ -255,6 +257,9 @@ describe("applyMediaUnderstanding", () => {
|
|||
vi.doMock("../media/fetch.js", () => ({
|
||||
fetchRemoteMedia: fetchRemoteMediaMock,
|
||||
}));
|
||||
vi.doMock("../media/ffmpeg-exec.js", () => ({
|
||||
runFfmpeg: runFfmpegMock,
|
||||
}));
|
||||
vi.doMock("../process/exec.js", () => ({
|
||||
runExec: runExecMock,
|
||||
}));
|
||||
|
|
@ -304,6 +309,7 @@ describe("applyMediaUnderstanding", () => {
|
|||
});
|
||||
hasAvailableAuthForProviderMock.mockClear();
|
||||
mockedFetchRemoteMedia.mockClear();
|
||||
mockedRunFfmpeg.mockReset();
|
||||
mockedRunExec.mockReset();
|
||||
mockedFetchRemoteMedia.mockResolvedValue({
|
||||
buffer: createSafeAudioFixtureBuffer(2048),
|
||||
|
|
@ -703,6 +709,65 @@ describe("applyMediaUnderstanding", () => {
|
|||
);
|
||||
});
|
||||
|
||||
it("transcodes non-wav audio before auto-detected whisper-cli runs", async () => {
|
||||
const binDir = await createTempMediaDir();
|
||||
const modelDir = await createTempMediaDir();
|
||||
await createMockExecutable(binDir, "whisper-cli");
|
||||
const modelPath = path.join(modelDir, "tiny.bin");
|
||||
await fs.writeFile(modelPath, "model");
|
||||
|
||||
const ctx = await createAudioCtx({
|
||||
fileName: "telegram-voice.ogg",
|
||||
mediaType: "audio/ogg",
|
||||
content: createSafeAudioFixtureBuffer(2048),
|
||||
});
|
||||
const cfg: OpenClawConfig = { tools: { media: { audio: {} } } };
|
||||
|
||||
mockedRunFfmpeg.mockImplementationOnce(async (args: string[]) => {
|
||||
const wavPath = args.at(-1);
|
||||
if (typeof wavPath !== "string") {
|
||||
throw new Error("missing wav path");
|
||||
}
|
||||
await fs.writeFile(wavPath, Buffer.from("RIFF"));
|
||||
return "";
|
||||
});
|
||||
mockedRunExec.mockResolvedValueOnce({
|
||||
stdout: "whisper cpp ogg ok\n",
|
||||
stderr: "",
|
||||
});
|
||||
|
||||
await withMediaAutoDetectEnv(
|
||||
{
|
||||
PATH: binDir,
|
||||
WHISPER_CPP_MODEL: modelPath,
|
||||
},
|
||||
async () => {
|
||||
const result = await applyMediaUnderstanding({ ctx, cfg });
|
||||
expect(result.appliedAudio).toBe(true);
|
||||
},
|
||||
);
|
||||
|
||||
expect(ctx.Transcript).toBe("whisper cpp ogg ok");
|
||||
expect(mockedRunFfmpeg).toHaveBeenCalledWith(
|
||||
expect.arrayContaining([
|
||||
"-i",
|
||||
expect.stringMatching(/telegram-voice\.ogg$/),
|
||||
"-ac",
|
||||
"1",
|
||||
"-ar",
|
||||
"16000",
|
||||
"-c:a",
|
||||
"pcm_s16le",
|
||||
expect.stringMatching(/telegram-voice\.wav$/),
|
||||
]),
|
||||
);
|
||||
expect(mockedRunExec).toHaveBeenCalledWith(
|
||||
"whisper-cli",
|
||||
expect.arrayContaining([expect.stringMatching(/telegram-voice\.wav$/)]),
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
|
||||
it("skips audio auto-detect when no supported binaries or provider keys are available", async () => {
|
||||
const emptyBinDir = await createTempMediaDir();
|
||||
const isolatedAgentDir = await createTempMediaDir();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,58 @@
|
|||
// Unit tests for transcribeFirstAudio (mention-preflight audio transcription).
import { beforeEach, describe, expect, it, vi } from "vitest";

// Hoisted so the vi.mock factory below can reference it before module imports run.
const runAudioTranscriptionMock = vi.hoisted(() => vi.fn());

vi.mock("./audio-transcription-runner.js", () => ({
  runAudioTranscription: (...args: unknown[]) => runAudioTranscriptionMock(...args),
}));

// Re-imported fresh in beforeEach (after vi.resetModules) so every test gets
// an isolated module instance wired to the mock above.
let transcribeFirstAudio: typeof import("./audio-preflight.js").transcribeFirstAudio;

describe("transcribeFirstAudio", () => {
  beforeEach(async () => {
    vi.resetModules();
    runAudioTranscriptionMock.mockReset();
    ({ transcribeFirstAudio } = await import("./audio-preflight.js"));
  });

  // Auto mode: with no tools.media.audio config at all, preflight should still
  // run and surface the transcript.
  it("runs audio preflight in auto mode when audio config is absent", async () => {
    runAudioTranscriptionMock.mockResolvedValueOnce({
      transcript: "voice note transcript",
      attachments: [],
    });

    const transcript = await transcribeFirstAudio({
      ctx: {
        Body: "<media:audio>",
        MediaPath: "/tmp/voice.ogg",
        MediaType: "audio/ogg",
      },
      cfg: {},
    });

    expect(transcript).toBe("voice note transcript");
    expect(runAudioTranscriptionMock).toHaveBeenCalledTimes(1);
  });

  // Explicit opt-out: enabled === false must short-circuit before any
  // transcription work is attempted.
  it("skips audio preflight when audio config is explicitly disabled", async () => {
    const transcript = await transcribeFirstAudio({
      ctx: {
        Body: "<media:audio>",
        MediaPath: "/tmp/voice.ogg",
        MediaType: "audio/ogg",
      },
      cfg: {
        tools: {
          media: {
            audio: {
              enabled: false,
            },
          },
        },
      },
    });

    expect(transcript).toBeUndefined();
    expect(runAudioTranscriptionMock).not.toHaveBeenCalled();
  });
});
|
||||
|
|
@ -26,7 +26,7 @@ export async function transcribeFirstAudio(params: {
|
|||
|
||||
// Check if audio transcription is enabled in config
|
||||
const audioConfig = cfg.tools?.media?.audio;
|
||||
if (!audioConfig || audioConfig.enabled === false) {
|
||||
if (audioConfig?.enabled === false) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import type {
|
|||
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||
import { resolveProxyFetchFromEnv } from "../infra/net/proxy-fetch.js";
|
||||
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
|
||||
import { runFfmpeg } from "../media/ffmpeg-exec.js";
|
||||
import { runExec } from "../process/exec.js";
|
||||
import { MediaAttachmentCache } from "./attachments.js";
|
||||
import {
|
||||
|
|
@ -210,6 +211,38 @@ async function resolveCliOutput(params: {
|
|||
return params.stdout.trim();
|
||||
}
|
||||
|
||||
async function resolveCliMediaPath(params: {
|
||||
capability: MediaUnderstandingCapability;
|
||||
command: string;
|
||||
mediaPath: string;
|
||||
outputDir: string;
|
||||
}): Promise<string> {
|
||||
const commandId = commandBase(params.command);
|
||||
if (params.capability !== "audio" || commandId !== "whisper-cli") {
|
||||
return params.mediaPath;
|
||||
}
|
||||
|
||||
const ext = path.extname(params.mediaPath).toLowerCase();
|
||||
if (ext === ".wav") {
|
||||
return params.mediaPath;
|
||||
}
|
||||
|
||||
const wavPath = path.join(params.outputDir, `${path.parse(params.mediaPath).name}.wav`);
|
||||
await runFfmpeg([
|
||||
"-y",
|
||||
"-i",
|
||||
params.mediaPath,
|
||||
"-ac",
|
||||
"1",
|
||||
"-ar",
|
||||
"16000",
|
||||
"-c:a",
|
||||
"pcm_s16le",
|
||||
wavPath,
|
||||
]);
|
||||
return wavPath;
|
||||
}
|
||||
|
||||
type ProviderQuery = Record<string, string | number | boolean>;
|
||||
|
||||
function normalizeProviderQuery(
|
||||
|
|
@ -619,7 +652,12 @@ export async function runCliEntry(params: {
|
|||
const outputDir = await fs.mkdtemp(
|
||||
path.join(resolvePreferredOpenClawTmpDir(), "openclaw-media-cli-"),
|
||||
);
|
||||
const mediaPath = pathResult.path;
|
||||
const mediaPath = await resolveCliMediaPath({
|
||||
capability,
|
||||
command,
|
||||
mediaPath: pathResult.path,
|
||||
outputDir,
|
||||
});
|
||||
const outputBase = path.join(outputDir, path.parse(mediaPath).name);
|
||||
|
||||
const templCtx: MsgContext = {
|
||||
|
|
|
|||
Loading…
Reference in New Issue