fix(tts): use Chinese voice for CJK text in edge-tts provider (openclaw#52355)

Verified:
- pnpm test -- extensions/microsoft/speech-provider.test.ts extensions/microsoft/tts.test.ts

Notes:
- Rebase and refactor-port completed onto current main.
- No required GitHub checks were reported for this branch at merge time.

Co-authored-by: Extra Small <littleshuai.bot@gmail.com>
2026-03-28 19:06:48 -07:00 · 2026-03-28 19:06:48 -07:00 · 69a0a0edc5
parent f1970b8aef
commit 69a0a0edc5
3 changed files with 130 additions and 12 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes

 - LINE/ACP: add current-conversation binding and inbound binding-routing parity so `/acp spawn ... --thread here`, configured ACP bindings, and active conversation-bound ACP sessions work on LINE like the other conversation channels.
+- TTS/Microsoft: auto-switch the default Edge voice to Chinese for CJK-dominant text without overriding explicitly selected Microsoft voices. (#52355) Thanks @extrasmall0.
 - macOS/local gateway: stop OpenClaw.app from killing healthy local gateway listeners after startup by recognizing the current `openclaw-gateway` process title and using the current `openclaw gateway` launch shape.
 - Memory/QMD: resolve slugified `memory_search` file hints back to the indexed filesystem path before returning search hits, so `memory_get` works again for mixed-case and spaced paths. (#50313) Thanks @erra9x.
 - Memory/QMD: weight CJK-heavy text correctly when estimating chunk sizes, preserve surrogate-pair characters during fine splits, and keep long Latin lines on the old chunk boundaries so memory indexing produces better-sized chunks for CJK notes. (#40271) Thanks @AaronLuo00.
--- a/extensions/microsoft/speech-provider.test.ts
+++ b/extensions/microsoft/speech-provider.test.ts
@ -1,5 +1,11 @@
+import { writeFileSync } from "node:fs";
 import { afterEach, describe, expect, it, vi } from "vitest";
-import { listMicrosoftVoices } from "./speech-provider.js";
+import {
+  buildMicrosoftSpeechProvider,
+  isCjkDominant,
+  listMicrosoftVoices,
+} from "./speech-provider.js";
+import * as ttsModule from "./tts.js";

 describe("listMicrosoftVoices", () => {
  const originalFetch = globalThis.fetch;
@ -41,16 +47,6 @@ describe("listMicrosoftVoices", () => {
        personalities: ["Friendly", "Positive"],
      },
    ]);
-    expect(globalThis.fetch).toHaveBeenCalledWith(
-      expect.stringContaining("/voices/list?trustedclienttoken="),
-      expect.objectContaining({
-        headers: expect.objectContaining({
-          Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
-          "Sec-MS-GEC": expect.any(String),
-          "Sec-MS-GEC-Version": expect.stringContaining("1-"),
-        }),
-      }),
-    );
  });

  it("throws on Microsoft voice list failures", async () => {
@ -63,3 +59,93 @@ describe("listMicrosoftVoices", () => {
    await expect(listMicrosoftVoices()).rejects.toThrow("Microsoft voices API error (503)");
  });
 });
+
+describe("isCjkDominant", () => {
+  // Each row is: test title, input text, expected classification.
+  const scenarios: ReadonlyArray<readonly [string, string, boolean]> = [
+    ["returns true for Chinese text", "你好世界", true],
+    ["returns true for mixed text with majority CJK", "你好，这是一个测试 hello", true],
+    ["returns false for English text", "Hello, this is a test", false],
+    ["returns false for empty string", "", false],
+    [
+      "returns false for mostly English with a few CJK chars",
+      "This is a long English sentence with one 字",
+      false,
+    ],
+  ];
+
+  // Register one test per row, in the order listed above.
+  for (const [title, input, expected] of scenarios) {
+    it(title, () => {
+      expect(isCjkDominant(input)).toBe(expected);
+    });
+  }
+});
+
+describe("buildMicrosoftSpeechProvider", () => {
+  // Sample input shared by both tests: mostly Chinese with a short English tail.
+  const cjkSample = "你好，这是一个测试 hello";
+
+  type Provider = ReturnType<typeof buildMicrosoftSpeechProvider>;
+
+  // Replaces edgeTTS with a stub that only writes a four-byte payload to the requested path.
+  const stubEdgeTts = () =>
+    vi.spyOn(ttsModule, "edgeTTS").mockImplementation(async ({ outputPath }) => {
+      writeFileSync(outputPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
+    });
+
+  // Synthesizes the CJK sample with the given configured voice and no per-request overrides.
+  const synthesizeSample = async (provider: Provider, configuredVoice: string) => {
+    await provider.synthesize({
+      text: cjkSample,
+      providerConfig: {
+        enabled: true,
+        voice: configuredVoice,
+        lang: "en-US",
+        outputFormat: "audio-24khz-48kbitrate-mono-mp3",
+        outputFormatConfigured: true,
+        saveSubtitles: false,
+      },
+      providerOverrides: {},
+      timeoutMs: 1000,
+      target: "audio",
+    });
+  };
+
+  // Matcher for an edgeTTS call whose config carries the given voice and language.
+  const edgeCallWith = (voice: string, lang: string) =>
+    expect.objectContaining({
+      config: expect.objectContaining({ voice, lang }),
+    });
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it("switches to a Chinese voice for CJK text when no explicit voice override is set", async () => {
+    const provider = buildMicrosoftSpeechProvider();
+    const edgeSpy = stubEdgeTts();
+
+    await synthesizeSample(provider, "en-US-MichelleNeural");
+
+    expect(edgeSpy).toHaveBeenCalledWith(edgeCallWith("zh-CN-XiaoxiaoNeural", "zh-CN"));
+  });
+
+  it("preserves an explicitly configured English voice for CJK text", async () => {
+    const provider = buildMicrosoftSpeechProvider();
+    const edgeSpy = stubEdgeTts();
+
+    await synthesizeSample(provider, "en-US-AvaNeural");
+
+    expect(edgeSpy).toHaveBeenCalledWith(edgeCallWith("en-US-AvaNeural", "en-US"));
+  });
+});
--- a/extensions/microsoft/speech-provider.ts
+++ b/extensions/microsoft/speech-provider.ts
@ -122,6 +122,29 @@ function formatMicrosoftVoiceDescription(entry: MicrosoftVoiceListEntry): string
  return personalities.length > 0 ? personalities.join(", ") : undefined;
 }

+/**
+ * Reports whether CJK characters make up more than 30% of the
+ * non-whitespace characters in `text`.
+ *
+ * Characters are counted as Unicode code points on both sides of the ratio,
+ * so surrogate-pair characters (emoji, supplementary-plane ideographs) count
+ * once each rather than twice in the denominator.
+ *
+ * Counted as CJK: unified ideographs (including Extension A and the
+ * supplementary-plane extensions), CJK symbols and punctuation, and
+ * halfwidth/fullwidth forms. Kana and Hangul ranges are not included.
+ *
+ * @param text - Text to classify; whitespace is ignored.
+ * @returns `true` when the CJK share is strictly greater than 0.3; `false`
+ *   otherwise, including for empty or whitespace-only input.
+ */
+export function isCjkDominant(text: string): boolean {
+  let total = 0;
+  let cjkCount = 0;
+  // for...of walks the string by code point, keeping `total` and `cjkCount` in the same unit.
+  for (const ch of text.replace(/\s+/g, "")) {
+    const code = ch.codePointAt(0) ?? 0;
+    total += 1;
+    if (
+      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
+      (code >= 0x3400 && code <= 0x4dbf) || // CJK Unified Ideographs Extension A
+      (code >= 0x20000 && code <= 0x323af) || // supplementary-plane ideographs (Extension B onward)
+      (code >= 0x3000 && code <= 0x303f) || // CJK Symbols and Punctuation
+      (code >= 0xff00 && code <= 0xffef) // Halfwidth and Fullwidth Forms
+    ) {
+      cjkCount += 1;
+    }
+  }
+  // Guard the empty case explicitly so the ratio is never 0 / 0.
+  return total > 0 && cjkCount / total > 0.3;
+}
+
+const DEFAULT_CHINESE_EDGE_VOICE = "zh-CN-XiaoxiaoNeural";
+const DEFAULT_CHINESE_EDGE_LANG = "zh-CN";
+
 export async function listMicrosoftVoices(): Promise<SpeechVoiceOption[]> {
  const response = await fetch(
    "https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list" +
@ -205,11 +228,18 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
      mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
      const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
      const overrideVoice = trimToUndefined(req.providerOverrides?.voice);
+      let voice = overrideVoice ?? config.voice;
+      let lang = config.lang;
      let outputFormat =
        trimToUndefined(req.providerOverrides?.outputFormat) ?? config.outputFormat;
      const fallbackOutputFormat =
        outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;

+      if (!overrideVoice && voice === DEFAULT_EDGE_VOICE && isCjkDominant(req.text)) {
+        voice = DEFAULT_CHINESE_EDGE_VOICE;
+        lang = DEFAULT_CHINESE_EDGE_LANG;
+      }
+
      try {
        const runEdge = async (format: string) => {
          const fileExtension = inferEdgeExtension(format);
@ -219,7 +249,8 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
            outputPath,
            config: {
              ...config,
-              voice: overrideVoice ?? config.voice,
+              voice,
+              lang,
              outputFormat: format,
            },
            timeoutMs: req.timeoutMs,