mirror of https://github.com/openclaw/openclaw.git
fix(tts): use Chinese voice for CJK text in edge-tts provider (openclaw#52355)
Verified: - pnpm test -- extensions/microsoft/speech-provider.test.ts extensions/microsoft/tts.test.ts Notes: - Rebases and refactor-port completed onto current main. - No required GitHub checks were reported for this branch at merge time. Co-authored-by: Extra Small <littleshuai.bot@gmail.com>
This commit is contained in:
parent
f1970b8aef
commit
69a0a0edc5
|
|
@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai
|
|||
### Fixes
|
||||
|
||||
- LINE/ACP: add current-conversation binding and inbound binding-routing parity so `/acp spawn ... --thread here`, configured ACP bindings, and active conversation-bound ACP sessions work on LINE like the other conversation channels.
|
||||
- TTS/Microsoft: auto-switch the default Edge voice to Chinese for CJK-dominant text without overriding explicitly selected Microsoft voices. (#52355) Thanks @extrasmall0.
|
||||
- macOS/local gateway: stop OpenClaw.app from killing healthy local gateway listeners after startup by recognizing the current `openclaw-gateway` process title and using the current `openclaw gateway` launch shape.
|
||||
- Memory/QMD: resolve slugified `memory_search` file hints back to the indexed filesystem path before returning search hits, so `memory_get` works again for mixed-case and spaced paths. (#50313) Thanks @erra9x.
|
||||
- Memory/QMD: weight CJK-heavy text correctly when estimating chunk sizes, preserve surrogate-pair characters during fine splits, and keep long Latin lines on the old chunk boundaries so memory indexing produces better-sized chunks for CJK notes. (#40271) Thanks @AaronLuo00.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,11 @@
|
|||
import { writeFileSync } from "node:fs";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { listMicrosoftVoices } from "./speech-provider.js";
|
||||
import {
|
||||
buildMicrosoftSpeechProvider,
|
||||
isCjkDominant,
|
||||
listMicrosoftVoices,
|
||||
} from "./speech-provider.js";
|
||||
import * as ttsModule from "./tts.js";
|
||||
|
||||
describe("listMicrosoftVoices", () => {
|
||||
const originalFetch = globalThis.fetch;
|
||||
|
|
@ -41,16 +47,6 @@ describe("listMicrosoftVoices", () => {
|
|||
personalities: ["Friendly", "Positive"],
|
||||
},
|
||||
]);
|
||||
expect(globalThis.fetch).toHaveBeenCalledWith(
|
||||
expect.stringContaining("/voices/list?trustedclienttoken="),
|
||||
expect.objectContaining({
|
||||
headers: expect.objectContaining({
|
||||
Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
|
||||
"Sec-MS-GEC": expect.any(String),
|
||||
"Sec-MS-GEC-Version": expect.stringContaining("1-"),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it("throws on Microsoft voice list failures", async () => {
|
||||
|
|
@ -63,3 +59,93 @@ describe("listMicrosoftVoices", () => {
|
|||
await expect(listMicrosoftVoices()).rejects.toThrow("Microsoft voices API error (503)");
|
||||
});
|
||||
});
|
||||
|
||||
// Table-driven coverage of the CJK-dominance heuristic.
// Each row is [test title, input text, expected result].
describe("isCjkDominant", () => {
  const cases: ReadonlyArray<readonly [string, string, boolean]> = [
    ["returns true for Chinese text", "你好世界", true],
    ["returns true for mixed text with majority CJK", "你好,这是一个测试 hello", true],
    ["returns false for English text", "Hello, this is a test", false],
    ["returns false for empty string", "", false],
    [
      "returns false for mostly English with a few CJK chars",
      "This is a long English sentence with one 字",
      false,
    ],
  ];

  for (const [title, input, expected] of cases) {
    it(title, () => {
      expect(isCjkDominant(input)).toBe(expected);
    });
  }
});
|
||||
|
||||
// Checks which voice/lang the provider forwards to `edgeTTS` for CJK-heavy text.
describe("buildMicrosoftSpeechProvider", () => {
  // Mixed sample: Chinese characters followed by a short Latin word.
  const cjkSample = "你好,这是一个测试 hello";

  afterEach(() => {
    vi.restoreAllMocks();
  });

  /**
   * Runs one synthesis of the CJK sample with `voice` as the configured voice
   * and an empty set of per-request overrides.
   *
   * `edgeTTS` is replaced by a stub that writes four bytes to the requested
   * output path; the spy wrapping that stub is returned so the caller can
   * inspect the config it was invoked with.
   */
  const synthesizeWithConfiguredVoice = async (voice: string) => {
    const provider = buildMicrosoftSpeechProvider();
    const edgeSpy = vi.spyOn(ttsModule, "edgeTTS").mockImplementation(async ({ outputPath }) => {
      writeFileSync(outputPath, Buffer.from([0xff, 0xfb, 0x90, 0x00]));
    });

    await provider.synthesize({
      text: cjkSample,
      providerConfig: {
        enabled: true,
        voice,
        lang: "en-US",
        outputFormat: "audio-24khz-48kbitrate-mono-mp3",
        outputFormatConfigured: true,
        saveSubtitles: false,
      },
      providerOverrides: {},
      timeoutMs: 1000,
      target: "audio",
    });

    return edgeSpy;
  };

  it("switches to a Chinese voice for CJK text when no explicit voice override is set", async () => {
    const edgeSpy = await synthesizeWithConfiguredVoice("en-US-MichelleNeural");

    expect(edgeSpy).toHaveBeenCalledWith(
      expect.objectContaining({
        config: expect.objectContaining({
          voice: "zh-CN-XiaoxiaoNeural",
          lang: "zh-CN",
        }),
      }),
    );
  });

  it("preserves an explicitly configured English voice for CJK text", async () => {
    const edgeSpy = await synthesizeWithConfiguredVoice("en-US-AvaNeural");

    expect(edgeSpy).toHaveBeenCalledWith(
      expect.objectContaining({
        config: expect.objectContaining({
          voice: "en-US-AvaNeural",
          lang: "en-US",
        }),
      }),
    );
  });
});
|
||||
|
|
|
|||
|
|
@ -122,6 +122,29 @@ function formatMicrosoftVoiceDescription(entry: MicrosoftVoiceListEntry): string
|
|||
return personalities.length > 0 ? personalities.join(", ") : undefined;
|
||||
}
|
||||
|
||||
/**
 * Reports whether `text` is dominated by Chinese/CJK characters.
 *
 * Whitespace is ignored. The remaining text is measured in Unicode code
 * points (not UTF-16 code units), so an emoji or a supplementary-plane
 * ideograph counts as exactly one character on both sides of the ratio.
 *
 * @param text - The text to inspect.
 * @returns `true` when strictly more than 30% of the non-whitespace
 *   characters are CJK; `false` otherwise, including for empty or
 *   whitespace-only input.
 */
export function isCjkDominant(text: string): boolean {
  const stripped = text.replace(/\s+/g, "");
  if (stripped.length === 0) {
    return false;
  }

  let totalCount = 0;
  let cjkCount = 0;
  // String iteration yields whole code points, keeping surrogate pairs intact.
  for (const ch of stripped) {
    totalCount += 1;
    const code = ch.codePointAt(0) ?? 0;
    if (
      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
      (code >= 0x3400 && code <= 0x4dbf) || // CJK Unified Ideographs Extension A
      (code >= 0x3000 && code <= 0x303f) || // CJK Symbols and Punctuation
      (code >= 0xff00 && code <= 0xffef) || // Halfwidth and Fullwidth Forms
      (code >= 0x20000 && code <= 0x3ffff) // Supplementary/Tertiary Ideographic Planes
    ) {
      cjkCount += 1;
    }
  }

  // Divide by the code-point count so the numerator and denominator use the same unit.
  return cjkCount / totalCount > 0.3;
}
|
||||
|
||||
/** Edge voice substituted for the default voice when the request text is CJK-dominant and no voice override is given. */
const DEFAULT_CHINESE_EDGE_VOICE = "zh-CN-XiaoxiaoNeural";
|
||||
/** Language value set together with the Chinese fallback voice. */
const DEFAULT_CHINESE_EDGE_LANG = "zh-CN";
|
||||
|
||||
export async function listMicrosoftVoices(): Promise<SpeechVoiceOption[]> {
|
||||
const response = await fetch(
|
||||
"https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list" +
|
||||
|
|
@ -205,11 +228,18 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
|
|||
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));
|
||||
const overrideVoice = trimToUndefined(req.providerOverrides?.voice);
|
||||
let voice = overrideVoice ?? config.voice;
|
||||
let lang = config.lang;
|
||||
let outputFormat =
|
||||
trimToUndefined(req.providerOverrides?.outputFormat) ?? config.outputFormat;
|
||||
const fallbackOutputFormat =
|
||||
outputFormat !== DEFAULT_EDGE_OUTPUT_FORMAT ? DEFAULT_EDGE_OUTPUT_FORMAT : undefined;
|
||||
|
||||
if (!overrideVoice && voice === DEFAULT_EDGE_VOICE && isCjkDominant(req.text)) {
|
||||
voice = DEFAULT_CHINESE_EDGE_VOICE;
|
||||
lang = DEFAULT_CHINESE_EDGE_LANG;
|
||||
}
|
||||
|
||||
try {
|
||||
const runEdge = async (format: string) => {
|
||||
const fileExtension = inferEdgeExtension(format);
|
||||
|
|
@ -219,7 +249,8 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
|
|||
outputPath,
|
||||
config: {
|
||||
...config,
|
||||
voice: overrideVoice ?? config.voice,
|
||||
voice,
|
||||
lang,
|
||||
outputFormat: format,
|
||||
},
|
||||
timeoutMs: req.timeoutMs,
|
||||
|
|
|
|||
Loading…
Reference in New Issue