fix(tts): use Chinese voice for CJK text in edge-tts provider (openclaw#52355)

Verified:
- pnpm test -- extensions/microsoft/speech-provider.test.ts extensions/microsoft/tts.test.ts

Notes:
- Rebase and refactor-port onto current main are complete.
- No required GitHub checks were reported for this branch at merge time.

Co-authored-by: Extra Small <littleshuai.bot@gmail.com>
This commit is contained in:
Extra Small 2026-03-28 19:06:48 -07:00 committed by GitHub
parent f1970b8aef
commit 69a0a0edc5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 130 additions and 12 deletions

View File

@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- LINE/ACP: add current-conversation binding and inbound binding-routing parity so `/acp spawn ... --thread here`, configured ACP bindings, and active conversation-bound ACP sessions work on LINE like the other conversation channels.
- TTS/Microsoft: auto-switch the default Edge voice to Chinese for CJK-dominant text without overriding explicitly selected Microsoft voices. (#52355) Thanks @extrasmall0.
- macOS/local gateway: stop OpenClaw.app from killing healthy local gateway listeners after startup by recognizing the current `openclaw-gateway` process title and using the current `openclaw gateway` launch shape.
- Memory/QMD: resolve slugified `memory_search` file hints back to the indexed filesystem path before returning search hits, so `memory_get` works again for mixed-case and spaced paths. (#50313) Thanks @erra9x.
- Memory/QMD: weight CJK-heavy text correctly when estimating chunk sizes, preserve surrogate-pair characters during fine splits, and keep long Latin lines on the old chunk boundaries so memory indexing produces better-sized chunks for CJK notes. (#40271) Thanks @AaronLuo00.

View File

@ -1,5 +1,11 @@
import { writeFileSync } from "node:fs";
import { afterEach, describe, expect, it, vi } from "vitest";
import { listMicrosoftVoices } from "./speech-provider.js";
import {
buildMicrosoftSpeechProvider,
isCjkDominant,
listMicrosoftVoices,
} from "./speech-provider.js";
import * as ttsModule from "./tts.js";
describe("listMicrosoftVoices", () => {
const originalFetch = globalThis.fetch;
@ -41,16 +47,6 @@ describe("listMicrosoftVoices", () => {
personalities: ["Friendly", "Positive"],
},
]);
expect(globalThis.fetch).toHaveBeenCalledWith(
expect.stringContaining("/voices/list?trustedclienttoken="),
expect.objectContaining({
headers: expect.objectContaining({
Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
"Sec-MS-GEC": expect.any(String),
"Sec-MS-GEC-Version": expect.stringContaining("1-"),
}),
}),
);
});
it("throws on Microsoft voice list failures", async () => {
@ -63,3 +59,93 @@ describe("listMicrosoftVoices", () => {
await expect(listMicrosoftVoices()).rejects.toThrow("Microsoft voices API error (503)");
});
});
describe("isCjkDominant", () => {
  // Each row: test title, input text, expected result of isCjkDominant(input).
  const cases: ReadonlyArray<readonly [string, string, boolean]> = [
    ["returns true for Chinese text", "你好世界", true],
    ["returns true for mixed text with majority CJK", "你好,这是一个测试 hello", true],
    ["returns false for English text", "Hello, this is a test", false],
    ["returns false for empty string", "", false],
    [
      "returns false for mostly English with a few CJK chars",
      "This is a long English sentence with one 字",
      false,
    ],
  ];

  // Register one test per row, in table order.
  for (const [title, input, expected] of cases) {
    it(title, () => {
      expect(isCjkDominant(input)).toBe(expected);
    });
  }
});
describe("buildMicrosoftSpeechProvider", () => {
  afterEach(() => {
    vi.restoreAllMocks();
  });

  /**
   * Builds a provider, stubs `edgeTTS` so it only writes four placeholder
   * bytes to the requested output path, then synthesizes a mixed
   * Chinese/English sample with `configuredVoice` as the configured voice
   * and no provider overrides.
   *
   * Returns the spy so each test can inspect the config passed to `edgeTTS`.
   */
  const synthesizeCjkSample = async (configuredVoice: string) => {
    const provider = buildMicrosoftSpeechProvider();
    const edgeSpy = vi.spyOn(ttsModule, "edgeTTS").mockImplementation(async ({ outputPath }) => {
      writeFileSync(outputPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
    });

    await provider.synthesize({
      text: "你好,这是一个测试 hello",
      providerConfig: {
        enabled: true,
        voice: configuredVoice,
        lang: "en-US",
        outputFormat: "audio-24khz-48kbitrate-mono-mp3",
        outputFormatConfigured: true,
        saveSubtitles: false,
      },
      providerOverrides: {},
      timeoutMs: 1000,
      target: "audio",
    });

    return edgeSpy;
  };

  it("switches to a Chinese voice for CJK text when no explicit voice override is set", async () => {
    // NOTE(review): "en-US-MichelleNeural" is presumably the provider's default
    // Edge voice; confirm against DEFAULT_EDGE_VOICE.
    const edgeSpy = await synthesizeCjkSample("en-US-MichelleNeural");

    expect(edgeSpy).toHaveBeenCalledWith(
      expect.objectContaining({
        config: expect.objectContaining({
          voice: "zh-CN-XiaoxiaoNeural",
          lang: "zh-CN",
        }),
      }),
    );
  });

  it("preserves an explicitly configured English voice for CJK text", async () => {
    const edgeSpy = await synthesizeCjkSample("en-US-AvaNeural");

    expect(edgeSpy).toHaveBeenCalledWith(
      expect.objectContaining({
        config: expect.objectContaining({
          voice: "en-US-AvaNeural",
          lang: "en-US",
        }),
      }),
    );
  });
});

View File

@ -122,6 +122,29 @@ function formatMicrosoftVoiceDescription(entry: MicrosoftVoiceListEntry): string
return personalities.length > 0 ? personalities.join(", ") : undefined;
}
/**
 * Reports whether CJK characters make up more than 30% of the non-whitespace
 * characters in `text`.
 *
 * Characters are counted as Unicode code points, so an emoji or a
 * supplementary-plane ideograph counts once in both the CJK tally and the
 * total. Measuring the total in UTF-16 code units instead would double-count
 * such characters and understate the CJK share.
 *
 * @param text - Text to classify; whitespace is ignored.
 * @returns `true` when the CJK share is strictly greater than 0.3; `false`
 *   otherwise, including for empty or whitespace-only input.
 */
export function isCjkDominant(text: string): boolean {
  let totalCount = 0;
  let cjkCount = 0;
  // for...of iterates by code point, keeping both counters in the same unit.
  for (const ch of text.replace(/\s+/g, "")) {
    totalCount += 1;
    const code = ch.codePointAt(0) ?? 0;
    if (
      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
      (code >= 0x3400 && code <= 0x4dbf) || // CJK Unified Ideographs Extension A
      (code >= 0x3000 && code <= 0x303f) || // CJK Symbols and Punctuation
      (code >= 0xff00 && code <= 0xffef) || // Halfwidth and Fullwidth Forms
      (code >= 0x20000 && code <= 0x323af) // Supplementary-plane ideographs (Extension B onward)
    ) {
      cjkCount += 1;
    }
  }
  return totalCount > 0 && cjkCount / totalCount > 0.3;
}
// Voice and language substituted by the speech provider when the request has
// no voice override, the configured voice is the default Edge voice, and the
// text is CJK-dominant.
const DEFAULT_CHINESE_EDGE_VOICE = "zh-CN-XiaoxiaoNeural";
const DEFAULT_CHINESE_EDGE_LANG = "zh-CN";
export async function listMicrosoftVoices(): Promise<SpeechVoiceOption[]> {
const response = await fetch(
"https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list" +
@ -205,11 +228,18 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
const overrideVoice = trimToUndefined(req.providerOverrides?.voice);
let voice = overrideVoice ?? config.voice;
let lang = config.lang;
let outputFormat =
trimToUndefined(req.providerOverrides?.outputFormat) ?? config.outputFormat;
const fallbackOutputFormat =
outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
if (!overrideVoice && voice === DEFAULT_EDGE_VOICE && isCjkDominant(req.text)) {
voice = DEFAULT_CHINESE_EDGE_VOICE;
lang = DEFAULT_CHINESE_EDGE_LANG;
}
try {
const runEdge = async (format: string) => {
const fileExtension = inferEdgeExtension(format);
@ -219,7 +249,8 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
outputPath,
config: {
...config,
voice: overrideVoice ?? config.voice,
voice,
lang,
outputFormat: format,
},
timeoutMs: req.timeoutMs,