// openclaw/extensions/voice-call/src/response-generator.ts

/**
* Voice call response generator - uses the embedded Pi agent for tool support.
* Routes voice responses through the same agent infrastructure as messaging.
*/
import crypto from "node:crypto";
import type { SessionEntry } from "../api.js";
import type { VoiceCallConfig } from "./config.js";
import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
/** Inputs required to generate one spoken reply for an active voice call. */
export type VoiceResponseParams = {
/** Voice call config */
voiceConfig: VoiceCallConfig;
/** Core OpenClaw config */
coreConfig: CoreConfig;
/** Injected host agent runtime */
agentRuntime: CoreAgentDeps;
/** Call ID for session tracking */
callId: string;
/** Caller's phone number */
from: string;
/** Conversation transcript */
transcript: Array<{ speaker: "user" | "bot"; text: string }>;
/** Latest user message */
userMessage: string;
};
/**
 * Outcome of a voice response attempt.
 * `text` is the speakable reply, or null when nothing usable was produced.
 * `error` carries a human-readable failure reason when generation failed.
 */
export type VoiceResponseResult = {
text: string | null;
error?: string;
};
/**
 * Minimal shape of an agent output payload that this module inspects.
 * Payloads flagged as errors or reasoning are never spoken aloud.
 */
type VoiceResponsePayload = {
text?: string;
isError?: boolean;
isReasoning?: boolean;
};
// Appended to every system prompt: instructs the model to answer with strict
// {"spoken":"..."} JSON so downstream parsing (tryParseSpokenJson) can
// reliably extract only the text meant to be spoken to the caller.
const VOICE_SPOKEN_OUTPUT_CONTRACT = [
"Output format requirements:",
'- Return only valid JSON in this exact shape: {"spoken":"..."}',
"- Do not include markdown, code fences, planning text, or extra keys.",
'- Put exactly what should be spoken to the caller into "spoken".',
'- If there is nothing to say, return {"spoken":""}.',
].join("\n");
/**
 * Collapse every run of whitespace to a single space and trim the edges.
 * Returns null when nothing speakable remains.
 */
function normalizeSpokenText(value: string): string | null {
  const collapsed = value.split(/\s+/).filter(Boolean).join(" ");
  return collapsed || null;
}
/**
 * Attempt to recover the "spoken" field from a model reply that should be
 * JSON of the shape {"spoken":"..."}. Tolerates code fences and surrounding
 * prose by trying progressively looser extractions, ending with a direct
 * regex scan for a "spoken" string value.
 *
 * @returns Whitespace-normalized spoken text ("" when the field is empty or
 *   whitespace-only), or null when no spoken field can be recovered.
 */
function tryParseSpokenJson(text: string): string | null {
  const trimmed = text.trim();
  if (!trimmed) {
    return null;
  }
  // Local whitespace collapser; empty/whitespace-only input yields "".
  const collapse = (value: string): string => value.replace(/\s+/g, " ").trim();

  const candidates = [trimmed];
  const fenceMatch = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(trimmed);
  if (fenceMatch?.[1]) {
    candidates.push(fenceMatch[1]);
  }
  const open = trimmed.indexOf("{");
  const close = trimmed.lastIndexOf("}");
  if (open >= 0 && close > open) {
    candidates.push(trimmed.slice(open, close + 1));
  }
  for (const candidate of candidates) {
    let parsed: { spoken?: unknown };
    try {
      parsed = JSON.parse(candidate) as { spoken?: unknown };
    } catch {
      continue; // Not valid JSON; try the next, looser candidate.
    }
    if (typeof parsed?.spoken === "string") {
      return collapse(parsed.spoken);
    }
  }
  // Last resort: locate a "spoken":"..." pair anywhere in the raw text.
  const inline = /"spoken"\s*:\s*"((?:[^"\\]|\\.)*)"/i.exec(trimmed);
  if (!inline) {
    return null;
  }
  try {
    // Re-wrap in quotes so JSON.parse decodes the escape sequences.
    return collapse(JSON.parse(`"${inline[1] ?? ""}"`) as string);
  } catch {
    return null;
  }
}
/**
 * Heuristic for spotting chain-of-thought / planning paragraphs that the
 * model leaked into plain-text output and that must not be spoken aloud.
 */
function isLikelyMetaReasoningParagraph(paragraph: string): boolean {
  const lower = paragraph.toLowerCase();
  if (!lower) {
    return false;
  }
  // Explicit reasoning headers at the start of the paragraph.
  const metaPrefixes = ["thinking process", "reasoning:", "analysis:"];
  if (metaPrefixes.some((prefix) => lower.startsWith(prefix))) {
    return true;
  }
  // "The user ..." openings combined with first-person planning language.
  const planningCues = ["i should", "i need to", "i will"];
  if (lower.startsWith("the user ") && planningCues.some((cue) => lower.includes(cue))) {
    return true;
  }
  // Boilerplate filler phrases that models emit while planning.
  const fillerPhrases = [
    "this is a natural continuation of the conversation",
    "keep the conversation flowing",
  ];
  return fillerPhrases.some((phrase) => lower.includes(phrase));
}
/**
 * Clean up a free-form (non-JSON) model reply for speech: strips fenced code
 * blocks, drops leading meta-reasoning paragraphs, and collapses whitespace.
 * Returns null when nothing speakable remains.
 */
function sanitizePlainSpokenText(text: string): string | null {
  const withoutFences = text.replace(/```[\s\S]*?```/g, " ").trim();
  if (!withoutFences) {
    return null;
  }
  const paragraphs: string[] = [];
  for (const chunk of withoutFences.split(/\n\s*\n+/)) {
    const trimmedChunk = chunk.trim();
    if (trimmedChunk) {
      paragraphs.push(trimmedChunk);
    }
  }
  // Skip leaked reasoning paragraphs at the front, but never drop the final
  // paragraph so a reply made entirely of meta text still produces output.
  let start = 0;
  while (start < paragraphs.length - 1 && isLikelyMetaReasoningParagraph(paragraphs[start])) {
    start += 1;
  }
  return normalizeSpokenText(paragraphs.slice(start).join(" "));
}
/**
 * Combine the speakable portions of all agent payloads into one utterance.
 * Error and reasoning payloads are skipped; each remaining payload is first
 * interpreted as structured {"spoken":...} JSON, then falls back to sanitized
 * plain text. Returns null when no payload yields anything to say.
 */
function extractSpokenTextFromPayloads(payloads: VoiceResponsePayload[]): string | null {
  const segments = payloads
    .filter((payload) => !payload.isError && !payload.isReasoning)
    .map((payload) => payload.text?.trim() ?? "")
    .filter((raw) => raw.length > 0)
    // Structured parse wins; an explicit empty "spoken" suppresses fallback.
    .map((raw) => tryParseSpokenJson(raw) ?? sanitizePlainSpokenText(raw) ?? "")
    .filter((segment) => segment.length > 0);
  return segments.length > 0 ? segments.join(" ").trim() : null;
}
/**
 * Generate a voice response using the embedded Pi agent with full tool support.
 * Uses the same agent infrastructure as messaging for consistent behavior.
 *
 * Flow: resolve a per-caller session (keyed on the digits of the caller's
 * phone number), build a system prompt embedding the call transcript plus the
 * strict JSON output contract, run the embedded agent, then extract the
 * speakable text from the agent's payloads.
 *
 * @param params - See {@link VoiceResponseParams}.
 * @returns Spoken text (null when the agent produced nothing speakable), with
 *   an `error` string when generation failed or was aborted. Never throws.
 */
export async function generateVoiceResponse(
params: VoiceResponseParams,
): Promise<VoiceResponseResult> {
const { voiceConfig, callId, from, transcript, userMessage, coreConfig, agentRuntime } = params;
if (!coreConfig) {
return { text: null, error: "Core config unavailable for voice response" };
}
const cfg = coreConfig;
// Build voice-specific session key based on phone number. Stripping
// non-digits maps differently formatted numbers to the same session.
const normalizedPhone = from.replace(/\D/g, "");
const sessionKey = `voice:${normalizedPhone}`;
const agentId = "main";
// Resolve paths via the injected runtime.
const storePath = agentRuntime.session.resolveStorePath(cfg.session?.store, { agentId });
const agentDir = agentRuntime.resolveAgentDir(cfg, agentId);
const workspaceDir = agentRuntime.resolveAgentWorkspaceDir(cfg, agentId);
// Ensure workspace exists
await agentRuntime.ensureAgentWorkspace({ dir: workspaceDir });
// Load or create the session entry for this caller; new entries are
// persisted immediately so the session id survives later lookups.
const sessionStore = agentRuntime.session.loadSessionStore(storePath);
const now = Date.now();
let sessionEntry = sessionStore[sessionKey] as SessionEntry | undefined;
if (!sessionEntry) {
sessionEntry = {
sessionId: crypto.randomUUID(),
updatedAt: now,
};
sessionStore[sessionKey] = sessionEntry;
await agentRuntime.session.saveSessionStore(storePath, sessionStore);
}
const sessionId = sessionEntry.sessionId;
const sessionFile = agentRuntime.session.resolveSessionFilePath(sessionId, sessionEntry, {
agentId,
});
// Resolve model from config. The ref is "provider/model"; when no "/" is
// present the whole string is the model and the default provider is used.
const modelRef =
voiceConfig.responseModel || `${agentRuntime.defaults.provider}/${agentRuntime.defaults.model}`;
const slashIndex = modelRef.indexOf("/");
const provider =
slashIndex === -1 ? agentRuntime.defaults.provider : modelRef.slice(0, slashIndex);
const model = slashIndex === -1 ? modelRef : modelRef.slice(slashIndex + 1);
// Resolve thinking level for this provider/model pair.
const thinkLevel = agentRuntime.resolveThinkingDefault({ cfg, provider, model });
// Resolve agent identity for personalized prompt
const identity = agentRuntime.resolveAgentIdentity(cfg, agentId);
const agentName = identity?.name?.trim() || "assistant";
// Build system prompt with conversation history
const basePrompt =
voiceConfig.responseSystemPrompt ??
`You are ${agentName}, a helpful voice assistant on a phone call. Keep responses brief and conversational (1-2 sentences max). Be natural and friendly. The caller's phone number is ${from}. You have access to tools - use them when helpful.`;
let extraSystemPrompt = basePrompt;
if (transcript.length > 0) {
const history = transcript
.map((entry) => `${entry.speaker === "bot" ? "You" : "Caller"}: ${entry.text}`)
.join("\n");
extraSystemPrompt = `${basePrompt}\n\nConversation so far:\n${history}`;
}
// Always append the JSON output contract last so it takes precedence over
// any conflicting instructions in a configured prompt.
extraSystemPrompt = `${extraSystemPrompt}\n\n${VOICE_SPOKEN_OUTPUT_CONTRACT}`;
// Resolve timeout: voice-specific override, else the runtime's agent default.
const timeoutMs = voiceConfig.responseTimeoutMs ?? agentRuntime.resolveAgentTimeoutMs({ cfg });
const runId = `voice:${callId}:${Date.now()}`;
try {
const result = await agentRuntime.runEmbeddedPiAgent({
sessionId,
sessionKey,
messageProvider: "voice",
sessionFile,
workspaceDir,
config: cfg,
prompt: userMessage,
provider,
model,
thinkLevel,
verboseLevel: "off",
timeoutMs,
runId,
lane: "voice",
extraSystemPrompt,
agentDir,
});
const text = extractSpokenTextFromPayloads((result.payloads ?? []) as VoiceResponsePayload[]);
// Surface an abort only when no speakable text was salvaged.
if (!text && result.meta?.aborted) {
return { text: null, error: "Response generation was aborted" };
}
return { text };
} catch (err) {
// Never propagate: callers treat any thrown error as a failed response.
console.error(`[voice-call] Response generation failed:`, err);
return { text: null, error: String(err) };
}
}