openclaw/src/media-understanding/image.ts

233 lines
7.0 KiB
TypeScript

import type { Api, Context, Model } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { isMinimaxVlmModel, minimaxUnderstandImage } from "../agents/minimax-vlm.js";
import {
getApiKeyForModel,
requireApiKey,
resolveApiKeyForProvider,
} from "../agents/model-auth.js";
import { normalizeModelRef } from "../agents/model-selection.js";
import { ensureOpenClawModelsJson } from "../agents/models-config.js";
import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";
import type {
ImageDescriptionRequest,
ImageDescriptionResult,
ImagesDescriptionRequest,
ImagesDescriptionResult,
} from "./types.js";
let piModelDiscoveryRuntimePromise: Promise<
typeof import("../agents/pi-model-discovery-runtime.js")
> | null = null;
function loadPiModelDiscoveryRuntime() {
piModelDiscoveryRuntimePromise ??= import("../agents/pi-model-discovery-runtime.js");
return piModelDiscoveryRuntimePromise;
}
function resolveImageToolMaxTokens(modelMaxTokens: number | undefined, requestedMaxTokens = 4096) {
if (
typeof modelMaxTokens !== "number" ||
!Number.isFinite(modelMaxTokens) ||
modelMaxTokens <= 0
) {
return requestedMaxTokens;
}
return Math.min(requestedMaxTokens, modelMaxTokens);
}
async function resolveImageRuntime(params: {
cfg: ImageDescriptionRequest["cfg"];
agentDir: string;
provider: string;
model: string;
profile?: string;
preferredProfile?: string;
}): Promise<{ apiKey: string; model: Model<Api> }> {
await ensureOpenClawModelsJson(params.cfg, params.agentDir);
const { discoverAuthStorage, discoverModels } = await loadPiModelDiscoveryRuntime();
const authStorage = discoverAuthStorage(params.agentDir);
const modelRegistry = discoverModels(authStorage, params.agentDir);
const resolvedRef = normalizeModelRef(params.provider, params.model);
const model = modelRegistry.find(resolvedRef.provider, resolvedRef.model) as Model<Api> | null;
if (!model) {
throw new Error(`Unknown model: ${resolvedRef.provider}/${resolvedRef.model}`);
}
if (!model.input?.includes("image")) {
throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
}
const apiKeyInfo = await getApiKeyForModel({
model,
cfg: params.cfg,
agentDir: params.agentDir,
profileId: params.profile,
preferredProfile: params.preferredProfile,
});
const apiKey = requireApiKey(apiKeyInfo, model.provider);
authStorage.setRuntimeApiKey(model.provider, apiKey);
return { apiKey, model };
}
function buildImageContext(
prompt: string,
images: Array<{ buffer: Buffer; mime?: string }>,
): Context {
return {
systemPrompt: prompt,
messages: [
{
role: "user",
content: images.map((image) => ({
type: "image" as const,
data: image.buffer.toString("base64"),
mimeType: image.mime ?? "image/jpeg",
})),
timestamp: Date.now(),
},
],
};
}
async function describeImagesWithMinimax(params: {
apiKey: string;
modelId: string;
modelBaseUrl?: string;
prompt: string;
images: Array<{ buffer: Buffer; mime?: string }>;
}): Promise<ImagesDescriptionResult> {
const responses: string[] = [];
for (const [index, image] of params.images.entries()) {
const prompt =
params.images.length > 1
? `${params.prompt}\n\nDescribe image ${index + 1} of ${params.images.length} independently.`
: params.prompt;
const text = await minimaxUnderstandImage({
apiKey: params.apiKey,
prompt,
imageDataUrl: `data:${image.mime ?? "image/jpeg"};base64,${image.buffer.toString("base64")}`,
modelBaseUrl: params.modelBaseUrl,
});
responses.push(params.images.length > 1 ? `Image ${index + 1}:\n${text.trim()}` : text.trim());
}
return {
text: responses.join("\n\n").trim(),
model: params.modelId,
};
}
function isUnknownModelError(err: unknown): boolean {
return err instanceof Error && /^Unknown model:/i.test(err.message);
}
function resolveConfiguredProviderBaseUrl(
cfg: ImageDescriptionRequest["cfg"],
provider: string,
): string | undefined {
const direct = cfg.models?.providers?.[provider];
if (typeof direct?.baseUrl === "string" && direct.baseUrl.trim()) {
return direct.baseUrl.trim();
}
return undefined;
}
async function resolveMinimaxVlmFallbackRuntime(params: {
cfg: ImageDescriptionRequest["cfg"];
agentDir: string;
provider: string;
profile?: string;
preferredProfile?: string;
}): Promise<{ apiKey: string; modelBaseUrl?: string }> {
const auth = await resolveApiKeyForProvider({
provider: params.provider,
cfg: params.cfg,
profileId: params.profile,
preferredProfile: params.preferredProfile,
agentDir: params.agentDir,
});
return {
apiKey: requireApiKey(auth, params.provider),
modelBaseUrl: resolveConfiguredProviderBaseUrl(params.cfg, params.provider),
};
}
export async function describeImagesWithModel(
params: ImagesDescriptionRequest,
): Promise<ImagesDescriptionResult> {
const prompt = params.prompt ?? "Describe the image.";
let apiKey: string;
let model: Model<Api> | undefined;
try {
const resolved = await resolveImageRuntime(params);
apiKey = resolved.apiKey;
model = resolved.model;
} catch (err) {
if (!isMinimaxVlmModel(params.provider, params.model) || !isUnknownModelError(err)) {
throw err;
}
const fallback = await resolveMinimaxVlmFallbackRuntime(params);
return await describeImagesWithMinimax({
apiKey: fallback.apiKey,
modelId: params.model,
modelBaseUrl: fallback.modelBaseUrl,
prompt,
images: params.images,
});
}
if (isMinimaxVlmModel(model.provider, model.id)) {
return await describeImagesWithMinimax({
apiKey,
modelId: model.id,
modelBaseUrl: model.baseUrl,
prompt,
images: params.images,
});
}
const context = buildImageContext(prompt, params.images);
const controller = new AbortController();
const timeout =
typeof params.timeoutMs === "number" &&
Number.isFinite(params.timeoutMs) &&
params.timeoutMs > 0
? setTimeout(() => controller.abort(), params.timeoutMs)
: undefined;
const message = await complete(model, context, {
apiKey,
maxTokens: resolveImageToolMaxTokens(model.maxTokens, params.maxTokens ?? 512),
signal: controller.signal,
}).finally(() => {
clearTimeout(timeout);
});
const text = coerceImageAssistantText({
message,
provider: model.provider,
model: model.id,
});
return { text, model: model.id };
}
export async function describeImageWithModel(
params: ImageDescriptionRequest,
): Promise<ImageDescriptionResult> {
return await describeImagesWithModel({
images: [
{
buffer: params.buffer,
fileName: params.fileName,
mime: params.mime,
},
],
model: params.model,
provider: params.provider,
prompt: params.prompt,
maxTokens: params.maxTokens,
timeoutMs: params.timeoutMs,
profile: params.profile,
preferredProfile: params.preferredProfile,
agentDir: params.agentDir,
cfg: params.cfg,
});
}