mirror of https://github.com/openclaw/openclaw.git
feat(tts): add in-memory speech synthesis
This commit is contained in:
parent
b36e456b09
commit
84ee6fbb76
|
|
@ -96,6 +96,7 @@ export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
|
|||
outputPath,
|
||||
config: {
|
||||
...req.config.edge,
|
||||
voice: req.overrides?.microsoft?.voice ?? req.config.edge.voice,
|
||||
outputFormat: format,
|
||||
},
|
||||
timeoutMs: req.config.timeoutMs,
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ export function buildOpenAISpeechProvider(): SpeechProviderPlugin {
|
|||
baseUrl: req.config.openai.baseUrl,
|
||||
model: req.overrides?.openai?.model ?? req.config.openai.model,
|
||||
voice: req.overrides?.openai?.voice ?? req.config.openai.voice,
|
||||
speed: req.config.openai.speed,
|
||||
speed: req.overrides?.openai?.speed ?? req.config.openai.speed,
|
||||
instructions: req.config.openai.instructions,
|
||||
responseFormat,
|
||||
timeoutMs: req.config.timeoutMs,
|
||||
|
|
|
|||
|
|
@ -162,6 +162,7 @@ export type TtsDirectiveOverrides = {
|
|||
openai?: {
|
||||
voice?: string;
|
||||
model?: string;
|
||||
speed?: number;
|
||||
};
|
||||
elevenlabs?: {
|
||||
voiceId?: string;
|
||||
|
|
@ -171,6 +172,9 @@ export type TtsDirectiveOverrides = {
|
|||
languageCode?: string;
|
||||
voiceSettings?: Partial<ResolvedTtsConfig["elevenlabs"]["voiceSettings"]>;
|
||||
};
|
||||
microsoft?: {
|
||||
voice?: string;
|
||||
};
|
||||
};
|
||||
|
||||
export type TtsDirectiveParseResult = {
|
||||
|
|
@ -191,6 +195,17 @@ export type TtsResult = {
|
|||
voiceCompatible?: boolean;
|
||||
};
|
||||
|
||||
/**
 * Outcome of `synthesizeSpeech`: the synthesized audio is handed back as an
 * in-memory buffer instead of a file path. `textToSpeech` consumes this result
 * and writes `audioBuffer` to a temporary file itself.
 *
 * Every field except `success` is optional; `textToSpeech` treats a result as
 * usable only when `success` is true and both `audioBuffer` and
 * `fileExtension` are present.
 */
export type TtsSynthesisResult = {
|
||||
success: boolean; // false when request setup fails (then `error` is set)
|
||||
audioBuffer?: Buffer; // audio bytes taken from the provider's synthesis result
|
||||
error?: string; // failure message; `textToSpeech` substitutes a generic one when absent
|
||||
latencyMs?: number; // elapsed milliseconds since the provider call started
|
||||
provider?: string; // provider that produced the audio
|
||||
outputFormat?: string; // passed through from the provider's synthesis result
|
||||
voiceCompatible?: boolean; // passed through from the provider's synthesis result
|
||||
fileExtension?: string; // appended verbatim to the temp file name — presumably includes the leading dot; TODO confirm
|
||||
};
|
||||
|
||||
export type TtsTelephonyResult = {
|
||||
success: boolean;
|
||||
audioBuffer?: Buffer;
|
||||
|
|
@ -601,6 +616,7 @@ function resolveTtsRequestSetup(params: {
|
|||
cfg: OpenClawConfig;
|
||||
prefsPath?: string;
|
||||
providerOverride?: TtsProvider;
|
||||
disableFallback?: boolean;
|
||||
}):
|
||||
| {
|
||||
config: ResolvedTtsConfig;
|
||||
|
|
@ -621,7 +637,7 @@ function resolveTtsRequestSetup(params: {
|
|||
const provider = normalizeSpeechProviderId(params.providerOverride) ?? userProvider;
|
||||
return {
|
||||
config,
|
||||
providers: resolveTtsProviderOrder(provider, params.cfg),
|
||||
providers: params.disableFallback ? [provider] : resolveTtsProviderOrder(provider, params.cfg),
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -631,12 +647,44 @@ export async function textToSpeech(params: {
|
|||
prefsPath?: string;
|
||||
channel?: string;
|
||||
overrides?: TtsDirectiveOverrides;
|
||||
disableFallback?: boolean;
|
||||
}): Promise<TtsResult> {
|
||||
const synthesis = await synthesizeSpeech(params);
|
||||
if (!synthesis.success || !synthesis.audioBuffer || !synthesis.fileExtension) {
|
||||
return buildTtsFailureResult([synthesis.error ?? "TTS conversion failed"]);
|
||||
}
|
||||
|
||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
|
||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`);
|
||||
writeFileSync(audioPath, synthesis.audioBuffer);
|
||||
scheduleCleanup(tempDir);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
audioPath,
|
||||
latencyMs: synthesis.latencyMs,
|
||||
provider: synthesis.provider,
|
||||
outputFormat: synthesis.outputFormat,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
};
|
||||
}
|
||||
|
||||
export async function synthesizeSpeech(params: {
|
||||
text: string;
|
||||
cfg: OpenClawConfig;
|
||||
prefsPath?: string;
|
||||
channel?: string;
|
||||
overrides?: TtsDirectiveOverrides;
|
||||
disableFallback?: boolean;
|
||||
}): Promise<TtsSynthesisResult> {
|
||||
const setup = resolveTtsRequestSetup({
|
||||
text: params.text,
|
||||
cfg: params.cfg,
|
||||
prefsPath: params.prefsPath,
|
||||
providerOverride: params.overrides?.provider,
|
||||
disableFallback: params.disableFallback,
|
||||
});
|
||||
if ("error" in setup) {
|
||||
return { success: false, error: setup.error };
|
||||
|
|
@ -667,22 +715,14 @@ export async function textToSpeech(params: {
|
|||
target,
|
||||
overrides: params.overrides,
|
||||
});
|
||||
const latencyMs = Date.now() - providerStart;
|
||||
|
||||
const tempRoot = resolvePreferredOpenClawTmpDir();
|
||||
mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
|
||||
const tempDir = mkdtempSync(path.join(tempRoot, "tts-"));
|
||||
const audioPath = path.join(tempDir, `voice-${Date.now()}${synthesis.fileExtension}`);
|
||||
writeFileSync(audioPath, synthesis.audioBuffer);
|
||||
scheduleCleanup(tempDir);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
audioPath,
|
||||
latencyMs,
|
||||
audioBuffer: synthesis.audioBuffer,
|
||||
latencyMs: Date.now() - providerStart,
|
||||
provider,
|
||||
outputFormat: synthesis.outputFormat,
|
||||
voiceCompatible: synthesis.voiceCompatible,
|
||||
fileExtension: synthesis.fileExtension,
|
||||
};
|
||||
} catch (err) {
|
||||
errors.push(formatTtsProviderError(provider, err));
|
||||
|
|
|
|||
Loading…
Reference in New Issue