import { constants as fsConstants } from "node:fs";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { hasAvailableAuthForProvider } from "../agents/model-auth.js";
import {
  findModelInCatalog,
  loadModelCatalog,
  modelSupportsVision,
} from "../agents/model-catalog.js";
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/config.js";
import {
  resolveAgentModelFallbackValues,
  resolveAgentModelPrimaryValue,
} from "../config/model-input.js";
import type {
  MediaUnderstandingConfig,
  MediaUnderstandingModelConfig,
} from "../config/types.tools.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import {
  mergeInboundPathRoots,
  resolveIMessageAttachmentRoots,
} from "../media/inbound-path-policy.js";
import { getDefaultMediaLocalRoots } from "../media/local-roots.js";
import { runExec } from "../process/exec.js";
import {
  MediaAttachmentCache,
  type MediaAttachmentCacheOptions,
  normalizeAttachments,
  selectAttachments,
} from "./attachments.js";
import {
  AUTO_AUDIO_KEY_PROVIDERS,
  AUTO_IMAGE_KEY_PROVIDERS,
  AUTO_VIDEO_KEY_PROVIDERS,
  DEFAULT_IMAGE_MODELS,
} from "./defaults.js";
import { isMediaUnderstandingSkipError } from "./errors.js";
import { fileExists } from "./fs.js";
import { extractGeminiResponse } from "./output-extract.js";
import {
  buildMediaUnderstandingRegistry,
  getMediaUnderstandingProvider,
  normalizeMediaProviderId,
} from "./provider-registry.js";
import { resolveModelEntries, resolveScopeDecision } from "./resolve.js";
import {
  buildModelDecision,
  formatDecisionSummary,
  runCliEntry,
  runProviderEntry,
} from "./runner.entries.js";
import type {
  MediaAttachment,
  MediaUnderstandingCapability,
  MediaUnderstandingDecision,
  MediaUnderstandingModelDecision,
  MediaUnderstandingOutput,
  MediaUnderstandingProvider,
} from "./types.js";

/** Provider/model pair identifying the model the caller is currently using. */
export type ActiveMediaModel = {
  provider: string;
  model?: string;
};

// NOTE(review): the type arguments in this file were restored from usage
// (provider lookups by string id yield objects exposing `transcribeAudio`,
// `describeImage` and `describeVideo`) - confirm against `./provider-registry.js`.
type ProviderRegistry = Map<string, MediaUnderstandingProvider>;

/** Result of {@link runCapability}: the produced outputs plus the decision trace. */
export type RunCapabilityResult = {
  outputs: MediaUnderstandingOutput[];
  decision: MediaUnderstandingDecision;
};

/**
 * Builds the provider registry by delegating to `buildMediaUnderstandingRegistry`.
 *
 * @param overrides - Optional providers forwarded unchanged to the registry builder.
 * @param cfg - Optional configuration forwarded unchanged to the registry builder.
 */
export function buildProviderRegistry(
  overrides?: Record<string, MediaUnderstandingProvider>,
  cfg?: OpenClawConfig,
): ProviderRegistry {
  return buildMediaUnderstandingRegistry(overrides, cfg);
}

/** Normalizes the attachments carried by a message context (delegates to `normalizeAttachments`). */
export function normalizeMediaAttachments(ctx: MsgContext): MediaAttachment[] {
  return normalizeAttachments(ctx);
}

/**
 * Merges the default media local roots with the iMessage attachment roots
 * resolved for the account referenced by `ctx.AccountId`.
 */
export function resolveMediaAttachmentLocalRoots(params: {
  cfg: OpenClawConfig;
  ctx: MsgContext;
}): readonly string[] {
  return mergeInboundPathRoots(
    getDefaultMediaLocalRoots(),
    resolveIMessageAttachmentRoots({
      cfg: params.cfg,
      accountId: params.ctx.AccountId,
    }),
  );
}

/** Creates a `MediaAttachmentCache` over the given attachments. */
export function createMediaAttachmentCache(
  attachments: MediaAttachment[],
  options?: MediaAttachmentCacheOptions,
): MediaAttachmentCache {
  return new MediaAttachmentCache(attachments, options);
}

// Both caches store the in-flight promise (not the settled value) so that
// concurrent callers share a single lookup/probe.
const binaryCache = new Map<string, Promise<string | null>>();
const geminiProbeCache = new Map<string, Promise<boolean>>();

/** Empties the binary lookup cache and the Gemini CLI probe cache. */
export function clearMediaUnderstandingBinaryCacheForTests(): void {
  binaryCache.clear();
  geminiProbeCache.clear();
}

/**
 * Expands a leading `~` or `~/` to the current user's home directory.
 * Any other value (including `~user/...`) is returned unchanged.
 */
function expandHomeDir(value: string): string {
  if (!value.startsWith("~")) {
    return value;
  }
  const home = os.homedir();
  if (value === "~") {
    return home;
  }
  if (value.startsWith("~/")) {
    return path.join(home, value.slice(2));
  }
  return value;
}

/** Returns true when the value contains a forward slash or a backslash. */
function hasPathSeparator(value: string): boolean {
  return value.includes("/") || value.includes("\\");
}

/**
 * Lists the file names to try for a binary.
 *
 * Outside Windows, or when the name already has an extension, only the name
 * itself is returned. On Windows the bare name is followed by one candidate per
 * distinct `PATHEXT` extension (falling back to `.EXE;.CMD;.BAT;.COM`).
 */
function candidateBinaryNames(name: string): string[] {
  if (process.platform !== "win32") {
    return [name];
  }
  const ext = path.extname(name);
  if (ext) {
    return [name];
  }
  const pathext = (process.env.PATHEXT ?? ".EXE;.CMD;.BAT;.COM")
    .split(";")
    .map((item) => item.trim())
    .filter(Boolean)
    .map((item) => (item.startsWith(".") ? item : `.${item}`));
  const unique = Array.from(new Set(pathext));
  return [name, ...unique.map((item) => `${name}${item}`)];
}

/**
 * Resolves to true when `filePath` is a regular file that can be executed.
 * On Windows every regular file qualifies; elsewhere `X_OK` access is required.
 * Any filesystem error resolves to false.
 */
async function isExecutable(filePath: string): Promise<boolean> {
  try {
    const stat = await fs.stat(filePath);
    if (!stat.isFile()) {
      return false;
    }
    if (process.platform === "win32") {
      return true;
    }
    await fs.access(filePath, fsConstants.X_OK);
    return true;
  } catch {
    return false;
  }
}

/**
 * Locates an executable by name or path.
 *
 * A value containing a path separator is first tried directly (after `~`
 * expansion); otherwise, or if that fails, each `PATH` entry is searched.
 * The lookup promise is cached under the untrimmed `name`.
 *
 * @returns The first executable candidate found, or `null`.
 */
async function findBinary(name: string): Promise<string | null> {
  const cached = binaryCache.get(name);
  if (cached) {
    return cached;
  }
  const resolved = (async () => {
    const direct = expandHomeDir(name.trim());
    if (direct && hasPathSeparator(direct)) {
      for (const candidate of candidateBinaryNames(direct)) {
        if (await isExecutable(candidate)) {
          return candidate;
        }
      }
    }
    const searchName = name.trim();
    if (!searchName) {
      return null;
    }
    const pathEntries = (process.env.PATH ?? "").split(path.delimiter);
    const candidates = candidateBinaryNames(searchName);
    for (const entryRaw of pathEntries) {
      // PATH entries may be wrapped in double quotes; strip them before use.
      const entry = expandHomeDir(entryRaw.trim().replace(/^"(.*)"$/, "$1"));
      if (!entry) {
        continue;
      }
      for (const candidate of candidates) {
        const fullPath = path.join(entry, candidate);
        if (await isExecutable(fullPath)) {
          return fullPath;
        }
      }
    }
    return null;
  })();
  binaryCache.set(name, resolved);
  return resolved;
}

/** Resolves to true when {@link findBinary} locates the named executable. */
async function hasBinary(name: string): Promise<boolean> {
  return Boolean(await findBinary(name));
}

/**
 * Checks once (cached) whether the `gemini` CLI is installed and responds.
 *
 * Runs `gemini --output-format json ok` with an 8 second timeout. The probe
 * succeeds when `extractGeminiResponse` yields a truthy value, or when it
 * yields null/undefined and stdout contains "ok" (case-insensitive).
 * A missing binary or a failed invocation resolves to false.
 */
async function probeGeminiCli(): Promise<boolean> {
  const cached = geminiProbeCache.get("gemini");
  if (cached) {
    return cached;
  }
  const resolved = (async () => {
    if (!(await hasBinary("gemini"))) {
      return false;
    }
    try {
      const { stdout } = await runExec("gemini", ["--output-format", "json", "ok"], {
        timeoutMs: 8000,
      });
      return Boolean(extractGeminiResponse(stdout) ?? stdout.toLowerCase().includes("ok"));
    } catch {
      return false;
    }
  })();
  geminiProbeCache.set("gemini", resolved);
  return resolved;
}

/**
 * Builds a CLI entry for `whisper-cli` when the binary and a model file exist.
 * The model comes from `WHISPER_CPP_MODEL` when that file exists, otherwise
 * from the hard-coded default path.
 */
async function resolveLocalWhisperCppEntry(): Promise<MediaUnderstandingModelConfig | null> {
  if (!(await hasBinary("whisper-cli"))) {
    return null;
  }
  const envModel = process.env.WHISPER_CPP_MODEL?.trim();
  const defaultModel = "/opt/homebrew/share/whisper-cpp/for-tests-ggml-tiny.bin";
  const modelPath = envModel && (await fileExists(envModel)) ? envModel : defaultModel;
  if (!(await fileExists(modelPath))) {
    return null;
  }
  return {
    type: "cli",
    command: "whisper-cli",
    args: ["-m", modelPath, "-otxt", "-of", "{{OutputBase}}", "-np", "-nt", "{{MediaPath}}"],
  };
}

/** Builds a CLI entry for the `whisper` binary when it is available. */
async function resolveLocalWhisperEntry(): Promise<MediaUnderstandingModelConfig | null> {
  if (!(await hasBinary("whisper"))) {
    return null;
  }
  return {
    type: "cli",
    command: "whisper",
    args: [
      "--model",
      "turbo",
      "--output_format",
      "txt",
      "--output_dir",
      "{{OutputDir}}",
      "--verbose",
      "False",
      "{{MediaPath}}",
    ],
  };
}

/**
 * Builds a CLI entry for `sherpa-onnx-offline` when the binary exists and
 * `SHERPA_ONNX_MODEL_DIR` contains `tokens.txt`, `encoder.onnx`,
 * `decoder.onnx` and `joiner.onnx`.
 */
async function resolveSherpaOnnxEntry(): Promise<MediaUnderstandingModelConfig | null> {
  if (!(await hasBinary("sherpa-onnx-offline"))) {
    return null;
  }
  const modelDir = process.env.SHERPA_ONNX_MODEL_DIR?.trim();
  if (!modelDir) {
    return null;
  }
  const tokens = path.join(modelDir, "tokens.txt");
  const encoder = path.join(modelDir, "encoder.onnx");
  const decoder = path.join(modelDir, "decoder.onnx");
  const joiner = path.join(modelDir, "joiner.onnx");
  // Checked one at a time, in this order, stopping at the first missing file.
  for (const requiredFile of [tokens, encoder, decoder, joiner]) {
    if (!(await fileExists(requiredFile))) {
      return null;
    }
  }
  return {
    type: "cli",
    command: "sherpa-onnx-offline",
    args: [
      `--tokens=${tokens}`,
      `--encoder=${encoder}`,
      `--decoder=${decoder}`,
      `--joiner=${joiner}`,
      "{{MediaPath}}",
    ],
  };
}

/**
 * Picks the first available local audio transcriber, in order of preference:
 * sherpa-onnx, whisper.cpp (`whisper-cli`), then `whisper`.
 */
async function resolveLocalAudioEntry(): Promise<MediaUnderstandingModelConfig | null> {
  const sherpa = await resolveSherpaOnnxEntry();
  if (sherpa) {
    return sherpa;
  }
  const whisperCpp = await resolveLocalWhisperCppEntry();
  if (whisperCpp) {
    return whisperCpp;
  }
  return await resolveLocalWhisperEntry();
}

/**
 * Builds a CLI entry for the `gemini` CLI when {@link probeGeminiCli} succeeds.
 * The capability argument is currently unused.
 */
async function resolveGeminiCliEntry(
  _capability: MediaUnderstandingCapability,
): Promise<MediaUnderstandingModelConfig | null> {
  if (!(await probeGeminiCli())) {
    return null;
  }
  return {
    type: "cli",
    command: "gemini",
    args: [
      "--output-format",
      "json",
      "--allowed-tools",
      "read_many_files",
      "--include-directories",
      "{{MediaDir}}",
      "{{Prompt}}",
      "Use read_many_files to read {{MediaPath}} and respond with only the text output.",
    ],
  };
}

/**
 * Finds the first provider that supports `capability` and has auth available.
 *
 * The active model's provider is tried first (with the active model id), then
 * the capability's auto-provider list in order. Image providers from the auto
 * list get their `DEFAULT_IMAGE_MODELS` entry; audio and video providers get
 * no explicit model.
 *
 * @returns A provider entry, or `null` when no provider qualifies.
 */
async function resolveKeyEntry(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  capability: MediaUnderstandingCapability;
  activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig | null> {
  const { cfg, agentDir, providerRegistry, capability } = params;

  // Returns a provider entry only when the provider is registered, implements
  // the handler for this capability, and has auth available.
  const checkProvider = async (
    providerId: string,
    model?: string,
  ): Promise<MediaUnderstandingModelConfig | null> => {
    const provider = getMediaUnderstandingProvider(providerId, providerRegistry);
    if (!provider) {
      return null;
    }
    if (capability === "audio" && !provider.transcribeAudio) {
      return null;
    }
    if (capability === "image" && !provider.describeImage) {
      return null;
    }
    if (capability === "video" && !provider.describeVideo) {
      return null;
    }
    if (
      !(await hasAvailableAuthForProvider({
        provider: providerId,
        cfg,
        agentDir,
      }))
    ) {
      return null;
    }
    return { type: "provider" as const, provider: providerId, model };
  };

  const activeProvider = params.activeModel?.provider?.trim();
  if (activeProvider) {
    const activeEntry = await checkProvider(activeProvider, params.activeModel?.model);
    if (activeEntry) {
      return activeEntry;
    }
  }

  const isImage = capability === "image";
  // Anything that is neither image nor video uses the audio list.
  const autoProviders: readonly string[] = isImage
    ? AUTO_IMAGE_KEY_PROVIDERS
    : capability === "video"
      ? AUTO_VIDEO_KEY_PROVIDERS
      : AUTO_AUDIO_KEY_PROVIDERS;
  for (const providerId of autoProviders) {
    const model = isImage ? DEFAULT_IMAGE_MODELS[providerId] : undefined;
    const entry = await checkProvider(providerId, model);
    if (entry) {
      return entry;
    }
  }
  return null;
}

/**
 * Converts the `agents.defaults.imageModel` primary and fallback references
 * into provider entries. References are split at the first `/`; references
 * with an empty provider or empty model part are dropped.
 */
function resolveImageModelFromAgentDefaults(cfg: OpenClawConfig): MediaUnderstandingModelConfig[] {
  const refs: string[] = [];
  const primary = resolveAgentModelPrimaryValue(cfg.agents?.defaults?.imageModel);
  if (primary?.trim()) {
    refs.push(primary.trim());
  }
  for (const fb of resolveAgentModelFallbackValues(cfg.agents?.defaults?.imageModel)) {
    if (fb?.trim()) {
      refs.push(fb.trim());
    }
  }
  if (refs.length === 0) {
    return [];
  }
  const entries: MediaUnderstandingModelConfig[] = [];
  for (const ref of refs) {
    const slashIdx = ref.indexOf("/");
    // Require a non-empty provider before the slash and a non-empty model after it.
    if (slashIdx <= 0 || slashIdx >= ref.length - 1) {
      continue;
    }
    entries.push({
      type: "provider",
      provider: ref.slice(0, slashIdx),
      model: ref.slice(slashIdx + 1),
    });
  }
  return entries;
}

/**
 * Chooses model entries when none are configured explicitly.
 *
 * Order of preference: the active model's provider, a local audio CLI (audio
 * only), the agent default image models (image only), the Gemini CLI, then
 * the first key-backed provider. Returns an empty array when nothing applies.
 */
async function resolveAutoEntries(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  capability: MediaUnderstandingCapability;
  activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig[]> {
  const activeEntry = await resolveActiveModelEntry(params);
  if (activeEntry) {
    return [activeEntry];
  }
  if (params.capability === "audio") {
    const localAudio = await resolveLocalAudioEntry();
    if (localAudio) {
      return [localAudio];
    }
  }
  if (params.capability === "image") {
    const imageModelEntries = resolveImageModelFromAgentDefaults(params.cfg);
    if (imageModelEntries.length > 0) {
      return imageModelEntries;
    }
  }
  const gemini = await resolveGeminiCliEntry(params.capability);
  if (gemini) {
    return [gemini];
  }
  const keys = await resolveKeyEntry(params);
  if (keys) {
    return [keys];
  }
  return [];
}

/**
 * Resolves a provider/model pair usable for image understanding.
 *
 * Tries the active model's provider first, then the key-backed providers.
 * CLI entries and entries without a resolvable model id yield `null`.
 */
export async function resolveAutoImageModel(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  activeModel?: ActiveMediaModel;
}): Promise<ActiveMediaModel | null> {
  const providerRegistry = buildProviderRegistry(undefined, params.cfg);

  // Converts a provider entry into an ActiveMediaModel, filling in the
  // provider's default image model when the entry names none.
  const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => {
    if (!entry || entry.type === "cli") {
      return null;
    }
    const provider = entry.provider;
    if (!provider) {
      return null;
    }
    const model = entry.model ?? DEFAULT_IMAGE_MODELS[provider];
    if (!model) {
      return null;
    }
    return { provider, model };
  };

  const activeEntry = await resolveActiveModelEntry({
    cfg: params.cfg,
    agentDir: params.agentDir,
    providerRegistry,
    capability: "image",
    activeModel: params.activeModel,
  });
  const resolvedActive = toActive(activeEntry);
  if (resolvedActive) {
    return resolvedActive;
  }
  const keyEntry = await resolveKeyEntry({
    cfg: params.cfg,
    agentDir: params.agentDir,
    providerRegistry,
    capability: "image",
    activeModel: params.activeModel,
  });
  return toActive(keyEntry);
}

/**
 * Builds a provider entry for the active model when its (normalized) provider
 * is registered, implements the handler for `capability`, and has auth
 * available. Returns `null` otherwise.
 */
async function resolveActiveModelEntry(params: {
  cfg: OpenClawConfig;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  capability: MediaUnderstandingCapability;
  activeModel?: ActiveMediaModel;
}): Promise<MediaUnderstandingModelConfig | null> {
  const activeProviderRaw = params.activeModel?.provider?.trim();
  if (!activeProviderRaw) {
    return null;
  }
  const providerId = normalizeMediaProviderId(activeProviderRaw);
  if (!providerId) {
    return null;
  }
  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
  if (!provider) {
    return null;
  }
  if (params.capability === "audio" && !provider.transcribeAudio) {
    return null;
  }
  if (params.capability === "image" && !provider.describeImage) {
    return null;
  }
  if (params.capability === "video" && !provider.describeVideo) {
    return null;
  }
  const hasAuth = await hasAvailableAuthForProvider({
    provider: providerId,
    cfg: params.cfg,
    agentDir: params.agentDir,
  });
  if (!hasAuth) {
    return null;
  }
  return {
    type: "provider",
    provider: providerId,
    model: params.activeModel?.model,
  };
}

/**
 * Runs the model entries against a single attachment, in order, until one
 * produces output.
 *
 * Every entry tried is recorded in `attempts` as `success`, `skipped` (empty
 * output or a skip error) or `failed` (any other error). Errors never
 * propagate; the next entry is tried instead.
 *
 * @returns The first non-empty output (or `null`) plus the attempt trace.
 */
async function runAttachmentEntries(params: {
  capability: MediaUnderstandingCapability;
  cfg: OpenClawConfig;
  ctx: MsgContext;
  attachmentIndex: number;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  cache: MediaAttachmentCache;
  entries: MediaUnderstandingModelConfig[];
  config?: MediaUnderstandingConfig;
}): Promise<{
  output: MediaUnderstandingOutput | null;
  attempts: MediaUnderstandingModelDecision[];
}> {
  const { entries, capability } = params;
  const attempts: MediaUnderstandingModelDecision[] = [];
  for (const entry of entries) {
    // Entries without an explicit type are CLI entries when they carry a command.
    const entryType = entry.type ?? (entry.command ? "cli" : "provider");
    try {
      const result =
        entryType === "cli"
          ? await runCliEntry({
              capability,
              entry,
              cfg: params.cfg,
              ctx: params.ctx,
              attachmentIndex: params.attachmentIndex,
              cache: params.cache,
              config: params.config,
            })
          : await runProviderEntry({
              capability,
              entry,
              cfg: params.cfg,
              ctx: params.ctx,
              attachmentIndex: params.attachmentIndex,
              cache: params.cache,
              agentDir: params.agentDir,
              providerRegistry: params.providerRegistry,
              config: params.config,
            });
      if (result) {
        const decision = buildModelDecision({ entry, entryType, outcome: "success" });
        // Prefer the provider/model actually reported by the run over the configured ones.
        if (result.provider) {
          decision.provider = result.provider;
        }
        if (result.model) {
          decision.model = result.model;
        }
        attempts.push(decision);
        return { output: result, attempts };
      }
      attempts.push(
        buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }),
      );
    } catch (err) {
      if (isMediaUnderstandingSkipError(err)) {
        attempts.push(
          buildModelDecision({
            entry,
            entryType,
            outcome: "skipped",
            reason: `${err.reason}: ${err.message}`,
          }),
        );
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
        }
        continue;
      }
      attempts.push(
        buildModelDecision({
          entry,
          entryType,
          outcome: "failed",
          reason: String(err),
        }),
      );
      if (shouldLogVerbose()) {
        logVerbose(`${capability} understanding failed: ${String(err)}`);
      }
    }
  }
  return { output: null, attempts };
}

/**
 * Runs one media-understanding capability over the selected attachments.
 *
 * Returns early, with an empty `outputs` array, when the capability is
 * disabled, no attachment is selected, the scope policy denies it, the active
 * model supports vision natively (image only), or no model entry can be
 * resolved. Otherwise each selected attachment is processed sequentially and
 * the overall outcome is `success` when at least one output was produced.
 *
 * @param params.config - Overrides `cfg.tools.media[capability]` when provided.
 * @returns The collected outputs and a per-attachment decision trace.
 */
export async function runCapability(params: {
  capability: MediaUnderstandingCapability;
  cfg: OpenClawConfig;
  ctx: MsgContext;
  attachments: MediaAttachmentCache;
  media: MediaAttachment[];
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  config?: MediaUnderstandingConfig;
  activeModel?: ActiveMediaModel;
}): Promise<RunCapabilityResult> {
  const { capability, cfg, ctx } = params;
  const config = params.config ?? cfg.tools?.media?.[capability];
  if (config?.enabled === false) {
    return {
      outputs: [],
      decision: { capability, outcome: "disabled", attachments: [] },
    };
  }

  const attachmentPolicy = config?.attachments;
  const selected = selectAttachments({
    capability,
    attachments: params.media,
    policy: attachmentPolicy,
  });
  if (selected.length === 0) {
    return {
      outputs: [],
      decision: { capability, outcome: "no-attachment", attachments: [] },
    };
  }

  const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
  if (scopeDecision === "deny") {
    if (shouldLogVerbose()) {
      logVerbose(`${capability} understanding disabled by scope policy.`);
    }
    return {
      outputs: [],
      decision: {
        capability,
        outcome: "scope-deny",
        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
      },
    };
  }

  // Skip image understanding when the primary model supports vision natively.
  // The image will be injected directly into the model context instead.
  const activeProvider = params.activeModel?.provider?.trim();
  if (capability === "image" && activeProvider) {
    const catalog = await loadModelCatalog({ config: cfg });
    const entry = findModelInCatalog(catalog, activeProvider, params.activeModel?.model ?? "");
    if (modelSupportsVision(entry)) {
      if (shouldLogVerbose()) {
        logVerbose("Skipping image understanding: primary model supports vision natively");
      }
      const model = params.activeModel?.model?.trim();
      const reason = "primary model supports vision natively";
      return {
        outputs: [],
        decision: {
          capability,
          outcome: "skipped",
          attachments: selected.map((item) => {
            const attempt = {
              type: "provider" as const,
              provider: activeProvider,
              model: model || undefined,
              outcome: "skipped" as const,
              reason,
            };
            return {
              attachmentIndex: item.index,
              attempts: [attempt],
              chosen: attempt,
            };
          }),
        },
      };
    }
  }

  const entries = resolveModelEntries({
    cfg,
    capability,
    config,
    providerRegistry: params.providerRegistry,
  });
  let resolvedEntries = entries;
  // Fall back to auto-detection only when nothing is configured explicitly.
  if (resolvedEntries.length === 0) {
    resolvedEntries = await resolveAutoEntries({
      cfg,
      agentDir: params.agentDir,
      providerRegistry: params.providerRegistry,
      capability,
      activeModel: params.activeModel,
    });
  }
  if (resolvedEntries.length === 0) {
    return {
      outputs: [],
      decision: {
        capability,
        outcome: "skipped",
        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
      },
    };
  }

  const outputs: MediaUnderstandingOutput[] = [];
  const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
  for (const attachment of selected) {
    const { output, attempts } = await runAttachmentEntries({
      capability,
      cfg,
      ctx,
      attachmentIndex: attachment.index,
      agentDir: params.agentDir,
      providerRegistry: params.providerRegistry,
      cache: params.attachments,
      entries: resolvedEntries,
      config,
    });
    if (output) {
      outputs.push(output);
    }
    attachmentDecisions.push({
      attachmentIndex: attachment.index,
      attempts,
      chosen: attempts.find((attempt) => attempt.outcome === "success"),
    });
  }

  const decision: MediaUnderstandingDecision = {
    capability,
    outcome: outputs.length > 0 ? "success" : "skipped",
    attachments: attachmentDecisions,
  };
  if (shouldLogVerbose()) {
    logVerbose(`Media understanding ${formatDecisionSummary(decision)}`);
  }
  return {
    outputs,
    decision,
  };
}