diff --git a/CHANGELOG.md b/CHANGELOG.md index c2e6daf8e9f..8efc07164cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai ### Fixes - LINE/ACP: add current-conversation binding and inbound binding-routing parity so `/acp spawn ... --thread here`, configured ACP bindings, and active conversation-bound ACP sessions work on LINE like the other conversation channels. +- TTS/Microsoft: auto-switch the default Edge voice to Chinese for CJK-dominant text without overriding explicitly selected Microsoft voices. (#52355) Thanks @extrasmall0. - macOS/local gateway: stop OpenClaw.app from killing healthy local gateway listeners after startup by recognizing the current `openclaw-gateway` process title and using the current `openclaw gateway` launch shape. - Memory/QMD: resolve slugified `memory_search` file hints back to the indexed filesystem path before returning search hits, so `memory_get` works again for mixed-case and spaced paths. (#50313) Thanks @erra9x. - Memory/QMD: weight CJK-heavy text correctly when estimating chunk sizes, preserve surrogate-pair characters during fine splits, and keep long Latin lines on the old chunk boundaries so memory indexing produces better-sized chunks for CJK notes. (#40271) Thanks @AaronLuo00. 
/**
 * Table-driven checks for `isCjkDominant`.
 * Each row is `[test title, input text, expected result]`.
 */
describe("isCjkDominant", () => {
  const cases: ReadonlyArray<readonly [string, string, boolean]> = [
    ["returns true for Chinese text", "你好世界", true],
    ["returns true for mixed text with majority CJK", "你好,这是一个测试 hello", true],
    ["returns false for English text", "Hello, this is a test", false],
    ["returns false for empty string", "", false],
    [
      "returns false for mostly English with a few CJK chars",
      "This is a long English sentence with one 字",
      false,
    ],
  ];

  // Register one test per row so every case still reports under its own title.
  for (const [title, input, expected] of cases) {
    it(title, () => {
      expect(isCjkDominant(input)).toBe(expected);
    });
  }
});
/**
 * Verifies which voice/lang `synthesize` forwards to `edgeTTS` when the
 * input text is CJK-dominant.
 */
describe("buildMicrosoftSpeechProvider", () => {
  const cjkDominantText = "你好,这是一个测试 hello";

  /**
   * Runs `synthesize` on the CJK sample with `edgeTTS` stubbed out and no
   * provider overrides, using `configuredVoice` as the configured voice.
   * Returns the spy so callers can inspect what `edgeTTS` received.
   */
  async function synthesizeWithConfiguredVoice(configuredVoice: string) {
    const provider = buildMicrosoftSpeechProvider();
    // The stub writes four placeholder bytes to the requested output path
    // instead of calling the real service.
    const edgeSpy = vi.spyOn(ttsModule, "edgeTTS").mockImplementation(async ({ outputPath }) => {
      writeFileSync(outputPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
    });

    await provider.synthesize({
      text: cjkDominantText,
      providerConfig: {
        enabled: true,
        voice: configuredVoice,
        lang: "en-US",
        outputFormat: "audio-24khz-48kbitrate-mono-mp3",
        outputFormatConfigured: true,
        saveSubtitles: false,
      },
      providerOverrides: {},
      timeoutMs: 1000,
      target: "audio",
    });

    return edgeSpy;
  }

  afterEach(() => {
    vi.restoreAllMocks();
  });

  it("switches to a Chinese voice for CJK text when no explicit voice override is set", async () => {
    const edgeSpy = await synthesizeWithConfiguredVoice("en-US-MichelleNeural");

    expect(edgeSpy).toHaveBeenCalledWith(
      expect.objectContaining({
        config: expect.objectContaining({
          voice: "zh-CN-XiaoxiaoNeural",
          lang: "zh-CN",
        }),
      }),
    );
  });

  it("preserves an explicitly configured English voice for CJK text", async () => {
    const edgeSpy = await synthesizeWithConfiguredVoice("en-US-AvaNeural");

    expect(edgeSpy).toHaveBeenCalledWith(
      expect.objectContaining({
        config: expect.objectContaining({
          voice: "en-US-AvaNeural",
          lang: "en-US",
        }),
      }),
    );
  });
});
/**
 * Inclusive code point ranges counted as CJK by {@link isCjkDominant}.
 * Hoisted to module scope so the table is built once, not per call.
 */
const CJK_CODE_POINT_RANGES: ReadonlyArray<readonly [number, number]> = [
  [0x4e00, 0x9fff], // CJK Unified Ideographs
  [0x3400, 0x4dbf], // CJK Unified Ideographs Extension A
  [0x3000, 0x303f], // CJK Symbols and Punctuation
  [0xff00, 0xffef], // Halfwidth and Fullwidth Forms
  [0xf900, 0xfaff], // CJK Compatibility Ideographs
  [0x20000, 0x2ffff], // Supplementary Ideographic Plane (Extension B and later)
];

/** Share of non-whitespace characters that must be exceeded to count as CJK-dominant. */
const CJK_DOMINANCE_THRESHOLD = 0.3;

/**
 * Reports whether more than 30% of the non-whitespace characters in `text`
 * fall inside the CJK ranges listed in `CJK_CODE_POINT_RANGES`.
 *
 * Both the CJK count and the total are measured in Unicode code points, so a
 * supplementary-plane character (emoji, Extension B ideograph, ...) counts as
 * exactly one character instead of two UTF-16 code units.
 *
 * Note: the ranges cover Han ideographs and CJK punctuation/fullwidth forms
 * only. Hiragana, Katakana (U+3040-U+30FF) and Hangul syllables are not
 * counted, while Han ideographs used in Japanese text are.
 *
 * @param text - Text to inspect; whitespace is ignored.
 * @returns `true` when the CJK share is strictly greater than 0.3; `false`
 *   otherwise, including for empty or whitespace-only input.
 */
export function isCjkDominant(text: string): boolean {
  const stripped = text.replace(/\s+/g, "");
  let totalCount = 0;
  let cjkCount = 0;
  // Iterating a string with for...of yields whole code points, keeping the
  // numerator and denominator in the same unit.
  for (const ch of stripped) {
    totalCount += 1;
    const code = ch.codePointAt(0) ?? 0;
    if (CJK_CODE_POINT_RANGES.some(([low, high]) => code >= low && code <= high)) {
      cjkCount += 1;
    }
  }
  if (totalCount === 0) {
    return false;
  }
  return cjkCount / totalCount > CJK_DOMINANCE_THRESHOLD;
}

/** Edge voice substituted for the default voice when the text is CJK-dominant. */
const DEFAULT_CHINESE_EDGE_VOICE = "zh-CN-XiaoxiaoNeural";
/** Language tag sent together with `DEFAULT_CHINESE_EDGE_VOICE`. */
const DEFAULT_CHINESE_EDGE_LANG = "zh-CN";