mirror of https://github.com/openclaw/openclaw.git
213 lines
6.0 KiB
TypeScript
213 lines
6.0 KiB
TypeScript
import { pcmToMulaw } from "../telephony-audio.js";
|
|
|
|
/**
|
|
* OpenAI TTS Provider
|
|
*
|
|
* Generates speech audio using OpenAI's text-to-speech API.
|
|
* Handles audio format conversion for telephony (mu-law 8kHz).
|
|
*
|
|
* Best practices from OpenAI docs:
|
|
* - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
|
|
* - Use tts-1 for lower latency, tts-1-hd for higher quality
|
|
* - Use marin or cedar voices for best quality
|
|
* - Use pcm or wav format for fastest response times
|
|
*
|
|
* @see https://platform.openai.com/docs/guides/text-to-speech
|
|
*/
|
|
|
|
/**
|
|
* OpenAI TTS configuration.
|
|
*/
|
|
export interface OpenAITTSConfig {
|
|
/** OpenAI API key (uses OPENAI_API_KEY env if not set) */
|
|
apiKey?: string;
|
|
/**
|
|
* TTS model:
|
|
* - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
|
|
* - tts-1: lower latency
|
|
* - tts-1-hd: higher quality
|
|
*/
|
|
model?: string;
|
|
/**
|
|
* Voice to use. For best quality, use marin or cedar.
|
|
* All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
|
|
* Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
|
|
*/
|
|
voice?: string;
|
|
/** Speed multiplier (0.25 to 4.0) */
|
|
speed?: number;
|
|
/**
|
|
* Instructions for speech style (only works with gpt-4o-mini-tts model).
|
|
* Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
|
|
*/
|
|
instructions?: string;
|
|
}
|
|
|
|
/**
|
|
* Supported OpenAI TTS voices (all 13 built-in voices).
|
|
* For best quality, use marin or cedar.
|
|
* Note: tts-1 and tts-1-hd support a smaller set.
|
|
*/
|
|
export const OPENAI_TTS_VOICES = [
|
|
"alloy",
|
|
"ash",
|
|
"ballad",
|
|
"coral",
|
|
"echo",
|
|
"fable",
|
|
"nova",
|
|
"onyx",
|
|
"sage",
|
|
"shimmer",
|
|
"verse",
|
|
"marin",
|
|
"cedar",
|
|
] as const;
|
|
|
|
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
|
|
|
|
/**
|
|
* OpenAI TTS Provider for generating speech audio.
|
|
*/
|
|
export class OpenAITTSProvider {
|
|
private apiKey: string;
|
|
private model: string;
|
|
private voice: OpenAITTSVoice;
|
|
private speed: number;
|
|
private instructions?: string;
|
|
|
|
constructor(config: OpenAITTSConfig = {}) {
|
|
this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
|
|
// Default to gpt-4o-mini-tts for intelligent realtime applications
|
|
this.model = config.model || "gpt-4o-mini-tts";
|
|
// Default to coral - good balance of quality and natural tone
|
|
this.voice = (config.voice as OpenAITTSVoice) || "coral";
|
|
this.speed = config.speed || 1.0;
|
|
this.instructions = config.instructions;
|
|
|
|
if (!this.apiKey) {
|
|
throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate speech audio from text.
|
|
* Returns raw PCM audio data (24kHz, mono, 16-bit).
|
|
*/
|
|
async synthesize(text: string, instructions?: string): Promise<Buffer> {
|
|
// Build request body
|
|
const body: Record<string, unknown> = {
|
|
model: this.model,
|
|
input: text,
|
|
voice: this.voice,
|
|
response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
|
|
speed: this.speed,
|
|
};
|
|
|
|
// Add instructions if using gpt-4o-mini-tts model
|
|
const effectiveInstructions = instructions || this.instructions;
|
|
if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
|
|
body.instructions = effectiveInstructions;
|
|
}
|
|
|
|
const response = await fetch("https://api.openai.com/v1/audio/speech", {
|
|
method: "POST",
|
|
headers: {
|
|
Authorization: `Bearer ${this.apiKey}`,
|
|
"Content-Type": "application/json",
|
|
},
|
|
body: JSON.stringify(body),
|
|
});
|
|
|
|
if (!response.ok) {
|
|
const error = await response.text();
|
|
throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
|
|
}
|
|
|
|
const arrayBuffer = await response.arrayBuffer();
|
|
return Buffer.from(arrayBuffer);
|
|
}
|
|
|
|
/**
|
|
* Generate speech and convert to mu-law format for Twilio.
|
|
* Twilio Media Streams expect 8kHz mono mu-law audio.
|
|
*/
|
|
async synthesizeForTwilio(text: string): Promise<Buffer> {
|
|
// Get raw PCM from OpenAI (24kHz, 16-bit signed LE, mono)
|
|
const pcm24k = await this.synthesize(text);
|
|
|
|
// Resample from 24kHz to 8kHz
|
|
const pcm8k = resample24kTo8k(pcm24k);
|
|
|
|
// Encode to mu-law
|
|
return pcmToMulaw(pcm8k);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Resample 24kHz PCM to 8kHz using linear interpolation.
|
|
* Input/output: 16-bit signed little-endian mono.
|
|
*/
|
|
function resample24kTo8k(input: Buffer): Buffer {
|
|
const inputSamples = input.length / 2;
|
|
const outputSamples = Math.floor(inputSamples / 3);
|
|
const output = Buffer.alloc(outputSamples * 2);
|
|
|
|
for (let i = 0; i < outputSamples; i++) {
|
|
// Calculate position in input (3:1 ratio)
|
|
const srcPos = i * 3;
|
|
const srcIdx = srcPos * 2;
|
|
|
|
if (srcIdx + 3 < input.length) {
|
|
// Linear interpolation between samples
|
|
const s0 = input.readInt16LE(srcIdx);
|
|
const s1 = input.readInt16LE(srcIdx + 2);
|
|
const frac = srcPos % 1 || 0;
|
|
const sample = Math.round(s0 + frac * (s1 - s0));
|
|
output.writeInt16LE(clamp16(sample), i * 2);
|
|
} else {
|
|
// Last sample
|
|
output.writeInt16LE(input.readInt16LE(srcIdx), i * 2);
|
|
}
|
|
}
|
|
|
|
return output;
|
|
}
|
|
|
|
/**
|
|
* Clamp value to 16-bit signed integer range.
|
|
*/
|
|
function clamp16(value: number): number {
|
|
return Math.max(-32768, Math.min(32767, value));
|
|
}
|
|
|
|
/**
|
|
* Convert 8-bit mu-law to 16-bit linear PCM.
|
|
* Useful for decoding incoming audio.
|
|
*/
|
|
export function mulawToLinear(mulaw: number): number {
|
|
// mu-law is transmitted inverted
|
|
mulaw = ~mulaw & 0xff;
|
|
|
|
const sign = mulaw & 0x80;
|
|
const exponent = (mulaw >> 4) & 0x07;
|
|
const mantissa = mulaw & 0x0f;
|
|
|
|
let sample = ((mantissa << 3) + 132) << exponent;
|
|
sample -= 132;
|
|
|
|
return sign ? -sample : sample;
|
|
}
|
|
|
|
/**
|
|
* Chunk audio buffer into 20ms frames for streaming.
|
|
* At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
|
|
*/
|
|
export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, void, unknown> {
|
|
return (function* () {
|
|
for (let i = 0; i < audio.length; i += chunkSize) {
|
|
yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
|
|
}
|
|
})();
|
|
}
|