// openclaw/extensions/voice-call/src/providers/tts-openai.ts

import { pcmToMulaw } from "../telephony-audio.js";

/**
 * OpenAI TTS Provider
 *
 * Generates speech audio using OpenAI's text-to-speech API.
 * Handles audio format conversion for telephony (mu-law 8kHz).
 *
 * Best practices from OpenAI docs:
 * - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
 * - Use tts-1 for lower latency, tts-1-hd for higher quality
 * - Use marin or cedar voices for best quality
 * - Use pcm or wav format for fastest response times
 *
 * @see https://platform.openai.com/docs/guides/text-to-speech
 */

/**
 * OpenAI TTS configuration.
 *
 * Every field is optional. The defaults mentioned below are the ones applied
 * by the `OpenAITTSProvider` constructor in this file.
 */
export interface OpenAITTSConfig {
  /**
   * OpenAI API key. When omitted (or empty), the provider falls back to the
   * `OPENAI_API_KEY` environment variable and throws if that is empty too.
   */
  apiKey?: string;
  /**
   * TTS model (provider default: `gpt-4o-mini-tts`):
   * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
   * - tts-1: lower latency
   * - tts-1-hd: higher quality
   */
  model?: string;
  /**
   * Voice to use (provider default: `coral`). For best quality, use marin or cedar.
   * All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
   * Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
   *
   * The value is typed as a plain string and is not validated by the provider.
   */
  voice?: string;
  /**
   * Speed multiplier (0.25 to 4.0). A missing or falsy value (including 0)
   * is replaced by 1.0; the range itself is not enforced by the provider.
   */
  speed?: number;
  /**
   * Instructions for speech style (only works with gpt-4o-mini-tts model).
   * The provider only sends this field when the model name contains
   * `gpt-4o-mini-tts`; for other models it is silently dropped.
   * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
   */
  instructions?: string;
}

/**
 * The 13 built-in OpenAI TTS voice names, kept as a readonly tuple so the
 * `OpenAITTSVoice` union can be derived from it.
 *
 * marin and cedar are the recommended voices for best quality.
 * Note: tts-1 and tts-1-hd support a smaller set.
 */
// prettier-ignore
export const OPENAI_TTS_VOICES = [
  "alloy", "ash", "ballad", "coral", "echo",
  "fable", "nova", "onyx", "sage", "shimmer",
  "verse", "marin", "cedar",
] as const;

/** Union of every voice name listed in `OPENAI_TTS_VOICES`. */
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];

/**
 * OpenAI TTS Provider for generating speech audio.
 *
 * Sends text to `POST https://api.openai.com/v1/audio/speech` and always
 * requests the raw `pcm` response format. Configuration is captured once at
 * construction time and never changes afterwards.
 */
export class OpenAITTSProvider {
  private readonly apiKey: string;
  private readonly model: string;
  private readonly voice: OpenAITTSVoice;
  private readonly speed: number;
  private readonly instructions?: string;

  /**
   * @param config Optional overrides; see `OpenAITTSConfig`.
   * @throws Error when no API key is supplied and `OPENAI_API_KEY` is unset or empty.
   */
  constructor(config: OpenAITTSConfig = {}) {
    this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
    // Default to gpt-4o-mini-tts for intelligent realtime applications
    this.model = config.model || "gpt-4o-mini-tts";
    // Default to coral - good balance of quality and natural tone.
    // The voice string is trusted as-is; it is not checked against OPENAI_TTS_VOICES.
    this.voice = (config.voice as OpenAITTSVoice) || "coral";
    // `||` on purpose: a missing, zero or NaN speed falls back to normal speed.
    this.speed = config.speed || 1.0;
    this.instructions = config.instructions;

    if (!this.apiKey) {
      throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
    }
  }

  /**
   * Generate speech audio from text.
   * Returns raw PCM audio data (24kHz, mono, 16-bit).
   *
   * @param text Text to speak.
   * @param instructions Per-call style instructions. When omitted or empty the
   *   instructions from the constructor config are used. Either way they are
   *   only sent when the model name contains `gpt-4o-mini-tts`.
   * @returns The response body as a Buffer.
   * @throws Error `OpenAI TTS failed: <status> - <body>` on a non-2xx response.
   */
  async synthesize(text: string, instructions?: string): Promise<Buffer> {
    // Build request body
    const body: Record<string, unknown> = {
      model: this.model,
      input: text,
      voice: this.voice,
      response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
      speed: this.speed,
    };

    // Add instructions if using gpt-4o-mini-tts model
    const effectiveInstructions = instructions || this.instructions;
    if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
      body.instructions = effectiveInstructions;
    }

    const response = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(body),
    });

    if (!response.ok) {
      // Reading the error body is best-effort: if it fails we still want the
      // caller to see the HTTP status instead of an unrelated stream error.
      let detail = "";
      try {
        detail = await response.text();
      } catch {
        detail = "";
      }
      throw new Error(`OpenAI TTS failed: ${response.status} - ${detail}`);
    }

    return Buffer.from(await response.arrayBuffer());
  }

  /**
   * Generate speech and convert to mu-law format for Twilio.
   * Twilio Media Streams expect 8kHz mono mu-law audio.
   *
   * @param text Text to speak.
   * @param instructions Optional per-call style instructions, forwarded
   *   unchanged to `synthesize` (same fallback and model rules apply).
   * @returns 8kHz mu-law encoded audio.
   * @throws Error when the underlying `synthesize` call fails.
   */
  async synthesizeForTwilio(text: string, instructions?: string): Promise<Buffer> {
    // Get raw PCM from OpenAI (24kHz, 16-bit signed LE, mono)
    const pcm24k = await this.synthesize(text, instructions);

    // Resample from 24kHz to 8kHz
    const pcm8k = resample24kTo8k(pcm24k);

    // Encode to mu-law
    return pcmToMulaw(pcm8k);
  }
}

/**
 * Downsample 24kHz PCM to 8kHz (exact 3:1 ratio).
 * Input/output: 16-bit signed little-endian mono.
 *
 * Each output sample is the rounded mean of one group of three consecutive
 * input samples. Averaging acts as a simple low-pass filter, so content above
 * the new 4kHz Nyquist limit is attenuated before it can alias into the
 * output; merely keeping every third sample would fold it straight into the
 * audible band.
 *
 * Trailing bytes that do not form a complete three-sample group are dropped,
 * so the result holds `floor(input.length / 6)` samples.
 *
 * @param input 24kHz PCM audio.
 * @returns A newly allocated buffer with the 8kHz PCM audio.
 */
function resample24kTo8k(input: Buffer): Buffer {
  const bytesPerSample = 2;
  const ratio = 3;
  const bytesPerGroup = bytesPerSample * ratio;

  const outputSamples = Math.floor(input.length / bytesPerGroup);
  const output = Buffer.alloc(outputSamples * bytesPerSample);

  for (let i = 0; i < outputSamples; i++) {
    const base = i * bytesPerGroup;
    const sum =
      input.readInt16LE(base) +
      input.readInt16LE(base + bytesPerSample) +
      input.readInt16LE(base + 2 * bytesPerSample);

    // The mean of three int16 values cannot leave the int16 range; the clamp
    // is kept purely as a guard for writeInt16LE, which throws out of range.
    output.writeInt16LE(clamp16(Math.round(sum / ratio)), i * bytesPerSample);
  }

  return output;
}

/**
 * Clamp value to 16-bit signed integer range.
 *
 * @param value Any number.
 * @returns `value` limited to [-32768, 32767].
 */
function clamp16(value: number): number {
  return Math.max(-32768, Math.min(32767, value));
}

/**
 * Decode a single 8-bit mu-law byte into a linear PCM sample.
 * Useful for decoding incoming audio.
 *
 * @param mulaw Encoded byte; only the low 8 bits are considered.
 * @returns Signed linear sample in the range [-32124, 32124].
 */
export function mulawToLinear(mulaw: number): number {
  const bias = 132;

  // The byte arrives bit-inverted: flip it back and keep the low 8 bits.
  const code = (mulaw ^ 0xff) & 0xff;

  // Layout after inversion: 1 sign bit | 3 exponent bits | 4 mantissa bits.
  const isNegative = (code & 0x80) !== 0;
  const exponent = (code >> 4) & 0x07;
  const mantissa = code & 0x0f;

  const magnitude = (((mantissa << 3) + bias) << exponent) - bias;

  return isNegative ? -magnitude : magnitude;
}

/**
 * Chunk audio buffer into 20ms frames for streaming.
 * At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
 *
 * The yielded buffers are views created with `subarray`, so they share memory
 * with `audio` rather than copying it. The final chunk is shorter when the
 * length is not a multiple of `chunkSize`; an empty buffer yields nothing.
 *
 * @param audio Audio bytes to split.
 * @param chunkSize Bytes per chunk (default 160). Must be at least 1.
 * @returns A generator over consecutive chunks of `audio`.
 * @throws RangeError when `chunkSize` is NaN or smaller than 1. Such a value
 *   would otherwise never advance through the buffer (an endless stream of
 *   empty chunks) or emit empty frames.
 */
export function chunkAudio(audio: Buffer, chunkSize = 160): Generator<Buffer, void, unknown> {
  // Validate eagerly, outside the generator body, so a bad size fails at the
  // call site instead of on the first iteration. `!(x >= 1)` also catches NaN.
  if (!(chunkSize >= 1)) {
    throw new RangeError(`chunkSize must be >= 1 (got ${chunkSize})`);
  }

  return (function* () {
    for (let offset = 0; offset < audio.length; offset += chunkSize) {
      yield audio.subarray(offset, Math.min(offset + chunkSize, audio.length));
    }
  })();
}