diff --git a/CHANGELOG.md b/CHANGELOG.md index 13d3d08ee90..248ee48fdd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -102,6 +102,7 @@ Docs: https://docs.openclaw.ai - Plugins/facades: guard bundled plugin facade loads with a cache-first sentinel so circular re-entry stops crashing `xai`, `sglang`, and `vllm` during gateway plugin startup. (#57508) Thanks @openperf. - Agents/MCP: dispose bundled MCP runtimes after one-shot `openclaw agent --local` runs finish, while preserving bundled MCP state across in-run retries so local JSON runs exit cleanly without restarting stateful MCP tools mid-run. - Gateway/OpenAI HTTP: restore default operator scopes for bearer-authenticated requests that omit `x-openclaw-scopes`, so headless `/v1/chat/completions` and session-history callers work again after the recent method-scope hardening. (#57596) Thanks @openperf. +- Gateway/attachments: offload large inbound images without leaking `media://` markers into text-only runs, preserve mixed attachment order for model input/transcripts, and fail closed when model image capability cannot be resolved. (#55513) Thanks @Syysean. ## 2026.3.28 diff --git a/src/agents/cli-runner/types.ts b/src/agents/cli-runner/types.ts index 7b4f30f207d..b138206c66f 100644 --- a/src/agents/cli-runner/types.ts +++ b/src/agents/cli-runner/types.ts @@ -4,6 +4,7 @@ import type { OpenClawConfig } from "../../config/config.js"; import type { CliSessionBinding } from "../../config/sessions.js"; import type { SessionSystemPromptReport } from "../../config/sessions/types.js"; import type { CliBackendConfig } from "../../config/types.js"; +import type { PromptImageOrderEntry } from "../../media/prompt-image-order.js"; import type { ResolvedCliBackend } from "../cli-backends.js"; export type RunCliAgentParams = { @@ -28,6 +29,7 @@ export type RunCliAgentParams = { bootstrapPromptWarningSignaturesSeen?: string[]; bootstrapPromptWarningSignature?: string; images?: ImageContent[]; + imageOrder?: PromptImageOrderEntry[]; }; export type CliPreparedBackend = { diff --git a/src/agents/command/attempt-execution.ts b/src/agents/command/attempt-execution.ts index 81d47d09a10..d53ece66e1b 100644 --- a/src/agents/command/attempt-execution.ts +++ b/src/agents/command/attempt-execution.ts @@ -368,6 +368,7 @@ export function runAgentAttempt(params: { bootstrapPromptWarningSignaturesSeen, bootstrapPromptWarningSignature, images: params.isFallbackRetry ? undefined : params.opts.images, + imageOrder: params.isFallbackRetry ? undefined : params.opts.imageOrder, streamParams: params.opts.streamParams, }); return runCliWithSession(cliSessionBinding?.sessionId).catch(async (err) => { @@ -455,6 +456,7 @@ export function runAgentAttempt(params: { skillsSnapshot: params.skillsSnapshot, prompt: effectivePrompt, images: params.isFallbackRetry ? undefined : params.opts.images, + imageOrder: params.isFallbackRetry ? undefined : params.opts.imageOrder, clientTools: params.opts.clientTools, provider: params.providerOverride, model: params.modelOverride, diff --git a/src/agents/command/types.ts b/src/agents/command/types.ts index 4c677a9c9e3..8c3e1f209b1 100644 --- a/src/agents/command/types.ts +++ b/src/agents/command/types.ts @@ -2,6 +2,7 @@ import type { AgentInternalEvent } from "../../agents/internal-events.js"; import type { ClientToolDefinition } from "../../agents/pi-embedded-runner/run/params.js"; import type { SpawnedRunMetadata } from "../../agents/spawned-context.js"; import type { ChannelOutboundTargetMode } from "../../channels/plugins/types.js"; +import type { PromptImageOrderEntry } from "../../media/prompt-image-order.js"; import type { InputProvenance } from "../../sessions/input-provenance.js"; /** Image content block for Claude API multimodal messages. */ @@ -35,6 +36,8 @@ export type AgentCommandOpts = { message: string; /** Optional image attachments for multimodal messages. */ images?: ImageContent[]; + /** Original inline/offloaded attachment order for inbound images. */ + imageOrder?: PromptImageOrderEntry[]; /** Optional client-provided tools (OpenResponses hosted tools). */ clientTools?: ClientToolDefinition[]; /** Agent id override (must exist in config). */ diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 3a90978a0f4..d44925b9e9d 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -506,6 +506,7 @@ export async function runEmbeddedPiAgent( skillsSnapshot: params.skillsSnapshot, prompt, images: params.images, + imageOrder: params.imageOrder, clientTools: params.clientTools, disableTools: params.disableTools, provider, diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index 35adb1e3ec4..4cc890ee8d3 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -1477,6 +1477,7 @@ export async function runEmbeddedAttempt( workspaceDir: effectiveWorkspace, model: params.model, existingImages: params.images, + imageOrder: params.imageOrder, maxBytes: MAX_IMAGE_BYTES, maxDimensionPx: resolveImageSanitizationLimits(params.config).maxDimensionPx, workspaceOnly: effectiveFsWorkspaceOnly, diff --git a/src/agents/pi-embedded-runner/run/images.test.ts b/src/agents/pi-embedded-runner/run/images.test.ts index b0dc1008ede..adf5d9c923b 100644 --- a/src/agents/pi-embedded-runner/run/images.test.ts +++ b/src/agents/pi-embedded-runner/run/images.test.ts @@ -8,7 +8,9 @@ import { detectAndLoadPromptImages, detectImageReferences, loadImageFromRef, + mergePromptAttachmentImages, modelSupportsImages, + splitPromptAndAttachmentRefs, } from "./images.js"; function expectNoPromptImages(result: { detectedRefs: unknown[]; images: unknown[] }) { @@ -289,6 +291,47 @@ describe("detectAndLoadPromptImages", () => { expectNoPromptImages(result); }); + it("preserves attachment order when offloaded refs and inline images are mixed", async () => { + const merged = mergePromptAttachmentImages({ + imageOrder: ["offloaded", "inline"], + existingImages: [{ type: "image", data: "small-b", mimeType: "image/png" }], + offloadedImages: [{ type: "image", data: "large-a", mimeType: "image/jpeg" }], + }); + + expect(merged).toEqual([ + { type: "image", data: "large-a", mimeType: "image/jpeg" }, + { type: "image", data: "small-b", mimeType: "image/png" }, + ]); + }); + + it("classifies trailing offloaded refs separately from prompt refs", () => { + const prompt = + "compare [media attached: media://inbound/prompt-ref.png] and ./prompt-b.png\n[media attached: media://inbound/att-b.png]"; + const refs = detectImageReferences(prompt); + + const split = splitPromptAndAttachmentRefs({ + prompt, + refs, + imageOrder: ["inline", "offloaded"], + }); + + expect(split.promptRefs).toEqual([ + { + raw: "media://inbound/prompt-ref.png", + type: "media-uri", + resolved: "media://inbound/prompt-ref.png", + }, + { raw: "./prompt-b.png", type: "path", resolved: "./prompt-b.png" }, + ]); + expect(split.attachmentRefs).toEqual([ + { + raw: "media://inbound/att-b.png", + type: "media-uri", + resolved: "media://inbound/att-b.png", + }, + ]); + }); + it("blocks prompt image refs outside workspace when sandbox workspaceOnly is enabled", async () => { const stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-native-image-sandbox-")); const sandboxRoot = path.join(stateDir, "sandbox"); diff --git a/src/agents/pi-embedded-runner/run/images.ts b/src/agents/pi-embedded-runner/run/images.ts index 686a7791e5a..531f79eafbc 100644 --- a/src/agents/pi-embedded-runner/run/images.ts +++ b/src/agents/pi-embedded-runner/run/images.ts @@ -1,6 +1,8 @@ import path from "node:path"; import type { ImageContent } from "@mariozechner/pi-ai"; import { assertNoWindowsNetworkPath, safeFileURLToPath } from "../../../infra/local-file-access.js"; +import type { PromptImageOrderEntry } from "../../../media/prompt-image-order.js"; +import { resolveMediaBufferPath, getMediaDir } from "../../../media/store.js"; import { loadWebMedia } from "../../../media/web-media.js"; import { resolveUserPath } from "../../../utils.js"; import type { ImageSanitizationLimits } from "../../image-sanitization.js"; @@ -38,6 +40,32 @@ const FILE_URL_REGEX_SOURCE = "file://[^\\s<>\"'`\\]]+\\.(?:" + IMAGE_EXTENSION_ const PATH_REGEX_SOURCE = "(?:^|\\s|[\"'`(])((\\.\\.?/|[~/])[^\\s\"'`()\\[\\]]*\\.(?:" + IMAGE_EXTENSION_PATTERN + "))"; +/** + * Matches the opaque media URI written by the Gateway's claim-check offload: + * media://inbound/ + * + * Uses an exclusion-based character class rather than a whitelist so that + * Unicode filenames (e.g. Chinese characters) preserved by sanitizeFilename + * in store.ts are matched correctly. + * + * Explicitly excluded from the ID segment: + * ] — closes the surrounding [media attached: ...] bracket + * \s — any whitespace (space, newline, tab) — terminates the token + * / — forward slash path separator (traversal prevention) + * \ — back slash path separator (traversal prevention) + * \x00 — null byte (path injection prevention) + * + * resolveMediaBufferPath applies its own guards against these characters, but + * excluding them here provides defence-in-depth at the parsing layer. + * + * Example valid IDs: + * "1c77ce17-20b9-4546-be64-6e36a9adcb2c.png" + * "photo---1c77ce17-20b9-4546-be64-6e36a9adcb2c.png" + * "图片---1c77ce17-20b9-4546-be64-6e36a9adcb2c.png" + */ +// eslint-disable-next-line no-control-regex +const MEDIA_URI_REGEX = /\bmedia:\/\/inbound\/([^\]\s/\\\x00]+)/; + /** * Result of detecting an image reference in text. */ @@ -45,8 +73,8 @@ export interface DetectedImageRef { /** The raw matched string from the prompt */ raw: string; /** The type of reference */ - type: "path"; - /** The resolved/normalized path */ + type: "path" | "media-uri"; + /** The resolved/normalized path, or the raw media URI for media-uri type */ resolved: string; } @@ -62,6 +90,105 @@ function normalizeRefForDedupe(raw: string): string { return process.platform === "win32" ? raw.toLowerCase() : raw; } +export function mergePromptAttachmentImages(params: { + imageOrder?: PromptImageOrderEntry[]; + existingImages?: ImageContent[]; + offloadedImages?: Array; + promptRefImages?: ImageContent[]; +}): ImageContent[] { + const promptImages: ImageContent[] = []; + const existingImages = params.existingImages ?? []; + const offloadedImages = params.offloadedImages ?? []; + + if (params.imageOrder && params.imageOrder.length > 0) { + let inlineIndex = 0; + let offloadedIndex = 0; + for (const entry of params.imageOrder) { + if (entry === "inline") { + const image = existingImages[inlineIndex++]; + if (image) { + promptImages.push(image); + } + continue; + } + const image = offloadedImages[offloadedIndex++]; + if (image) { + promptImages.push(image); + } + } + while (inlineIndex < existingImages.length) { + promptImages.push(existingImages[inlineIndex++]); + } + while (offloadedIndex < offloadedImages.length) { + const image = offloadedImages[offloadedIndex++]; + if (image) { + promptImages.push(image); + } + } + } else { + promptImages.push(...existingImages); + for (const image of offloadedImages) { + if (image) { + promptImages.push(image); + } + } + } + + promptImages.push(...(params.promptRefImages ?? [])); + return promptImages; +} + +function extractTrailingAttachmentMediaUris(prompt: string, count: number): string[] { + if (count <= 0) { + return []; + } + + const lines = prompt.split(/\r?\n/); + const uris: string[] = []; + for (let index = lines.length - 1; index >= 0 && uris.length < count; index--) { + const line = lines[index]?.trim(); + if (!line || line.includes("\0")) { + break; + } + const match = line.match(/^\[media attached:\s*(media:\/\/inbound\/[^\]\s/\\]+)\]$/); + if (!match?.[1]) { + break; + } + uris.unshift(match[1]); + } + return uris; +} + +export function splitPromptAndAttachmentRefs(params: { + prompt: string; + refs: DetectedImageRef[]; + imageOrder?: PromptImageOrderEntry[]; +}): { + promptRefs: DetectedImageRef[]; + attachmentRefs: DetectedImageRef[]; +} { + const offloadedCount = params.imageOrder?.filter((entry) => entry === "offloaded").length ?? 0; + if (offloadedCount === 0) { + return { promptRefs: params.refs, attachmentRefs: [] }; + } + + const attachmentUris = new Set(extractTrailingAttachmentMediaUris(params.prompt, offloadedCount)); + if (attachmentUris.size === 0) { + return { promptRefs: params.refs, attachmentRefs: [] }; + } + + const promptRefs: DetectedImageRef[] = []; + const attachmentRefs: DetectedImageRef[] = []; + for (const ref of params.refs) { + if (ref.type === "media-uri" && attachmentUris.has(ref.resolved)) { + attachmentRefs.push(ref); + continue; + } + promptRefs.push(ref); + } + return { promptRefs, attachmentRefs }; +} + async function sanitizeImagesWithLog( images: ImageContent[], label: string, @@ -87,6 +214,7 @@ async function sanitizeImagesWithLog( * - Home paths: ~/Pictures/screenshot.png * - file:// URLs: file:///path/to/image.png * - Message attachments: [Image: source: /path/to/image.jpg] + * - Gateway claim-check URIs: [media attached: media://inbound/] * * @param prompt The user prompt text to scan * @returns Array of detected image references @@ -135,6 +263,20 @@ export function detectImageReferences(prompt: string): DetectedImageRef[] { continue; } + // Check for a Gateway claim-check URI first (media://inbound/). + // This must be tested before the extension-based path regex because the + // URI has no file extension suffix in its base form. + const mediaUriMatch = content.match(MEDIA_URI_REGEX); + if (mediaUriMatch) { + const uri = `media://inbound/${mediaUriMatch[1]}`; + const dedupeKey = normalizeRefForDedupe(uri); + if (!seen.has(dedupeKey)) { + seen.add(dedupeKey); + refs.push({ raw: uri, type: "media-uri", resolved: uri }); + } + continue; + } + // Extract path before the (mime/type) or | delimiter // Format is: path (type) | url OR just: path (type) // Path may contain spaces (e.g., "ChatGPT Image Apr 21.png") @@ -205,6 +347,44 @@ export async function loadImageFromRef( sandbox?: { root: string; bridge: SandboxFsBridge }; }, ): Promise { + // Handle Gateway claim-check URIs (media://inbound/). + // These are written by the Gateway's offload path and point to files that + // the Gateway has already validated and persisted. They are intentionally + // exempt from workspaceOnly checks because they live in the media store + // managed by the Gateway, not in the agent workspace. + if (ref.type === "media-uri") { + const uriMatch = ref.resolved.match(MEDIA_URI_REGEX); + if (!uriMatch) { + log.debug(`Native image: malformed media URI, skipping: ${ref.resolved}`); + return null; + } + const mediaId = uriMatch[1]; + try { + // resolveMediaBufferPath accepts the media ID (with optional extension + // and original-filename prefix) and returns the absolute path of the + // persisted file. It applies its own guards against path traversal, + // symlinks, and null bytes. + const physicalPath = await resolveMediaBufferPath(mediaId, "inbound"); + const media = await loadWebMedia(physicalPath, { + maxBytes: options?.maxBytes, + localRoots: [getMediaDir()], + }); + if (media.kind !== "image") { + log.debug(`Native image: media store entry is not an image: ${mediaId}`); + return null; + } + const mimeType = media.contentType ?? "image/jpeg"; + const data = media.buffer.toString("base64"); + log.debug(`Native image: loaded media-uri ${ref.resolved} -> ${physicalPath}`); + return { type: "image", data, mimeType }; + } catch (err) { + log.debug( + `Native image: failed to load media-uri ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`, + ); + return null; + } + } + try { let targetPath = ref.resolved; @@ -292,6 +472,7 @@ export async function detectAndLoadPromptImages(params: { workspaceDir: string; model: { input?: string[] }; existingImages?: ImageContent[]; + imageOrder?: PromptImageOrderEntry[]; maxBytes?: number; maxDimensionPx?: number; workspaceOnly?: boolean; @@ -326,20 +507,25 @@ export async function detectAndLoadPromptImages(params: { } log.debug(`Native image: detected ${allRefs.length} image refs in prompt`); - - const promptImages: ImageContent[] = [...(params.existingImages ?? [])]; + const { promptRefs, attachmentRefs } = splitPromptAndAttachmentRefs({ + prompt: params.prompt, + refs: allRefs, + imageOrder: params.imageOrder, + }); + const promptRefImages: ImageContent[] = []; + const offloadedImages: Array = []; let loadedCount = 0; let skippedCount = 0; - for (const ref of allRefs) { + for (const ref of promptRefs) { const image = await loadImageFromRef(ref, params.workspaceDir, { maxBytes: params.maxBytes, workspaceOnly: params.workspaceOnly, sandbox: params.sandbox, }); if (image) { - promptImages.push(image); + promptRefImages.push(image); loadedCount++; log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`); } else { @@ -347,6 +533,28 @@ export async function detectAndLoadPromptImages(params: { } } + for (const ref of attachmentRefs) { + const image = await loadImageFromRef(ref, params.workspaceDir, { + maxBytes: params.maxBytes, + workspaceOnly: params.workspaceOnly, + sandbox: params.sandbox, + }); + offloadedImages.push(image); + if (image) { + loadedCount++; + log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`); + } else { + skippedCount++; + } + } + + const promptImages = mergePromptAttachmentImages({ + imageOrder: params.imageOrder, + existingImages: params.existingImages, + offloadedImages, + promptRefImages, + }); + const imageSanitization: ImageSanitizationLimits = { maxDimensionPx: params.maxDimensionPx, }; diff --git a/src/agents/pi-embedded-runner/run/params.ts b/src/agents/pi-embedded-runner/run/params.ts index 9ed4411cd34..88d4f0d114e 100644 --- a/src/agents/pi-embedded-runner/run/params.ts +++ b/src/agents/pi-embedded-runner/run/params.ts @@ -2,6 +2,7 @@ import type { ImageContent } from "@mariozechner/pi-ai"; import type { ReasoningLevel, ThinkLevel, VerboseLevel } from "../../../auto-reply/thinking.js"; import type { ReplyPayload } from "../../../auto-reply/types.js"; import type { OpenClawConfig } from "../../../config/config.js"; +import type { PromptImageOrderEntry } from "../../../media/prompt-image-order.js"; import type { enqueueCommand } from "../../../process/command-queue.js"; import type { InputProvenance } from "../../../sessions/input-provenance.js"; import type { ExecElevatedDefaults, ExecToolDefaults } from "../../bash-tools.js"; @@ -76,6 +77,7 @@ export type RunEmbeddedPiAgentParams = { skillsSnapshot?: SkillSnapshot; prompt: string; images?: ImageContent[]; + imageOrder?: PromptImageOrderEntry[]; /** Optional client-provided tools (OpenResponses hosted tools). */ clientTools?: ClientToolDefinition[]; /** Disable built-in tools for this run (LLM-only mode). */ diff --git a/src/agents/skills-install.ts b/src/agents/skills-install.ts index f1f7d8a4480..b5db83c89d1 100644 --- a/src/agents/skills-install.ts +++ b/src/agents/skills-install.ts @@ -552,7 +552,6 @@ export async function installSkill(params: SkillInstallRequest): Promise void; onReplyStart?: () => Promise | void; diff --git a/src/gateway/chat-attachments.ts b/src/gateway/chat-attachments.ts index 53dcf6a288e..6315f6057a4 100644 --- a/src/gateway/chat-attachments.ts +++ b/src/gateway/chat-attachments.ts @@ -1,5 +1,7 @@ import { estimateBase64DecodedBytes } from "../media/base64.js"; +import type { PromptImageOrderEntry } from "../media/prompt-image-order.js"; import { sniffMimeFromBase64 } from "../media/sniff-mime-from-base64.js"; +import { deleteMediaBuffer, saveMediaBuffer } from "../media/store.js"; export type ChatAttachment = { type?: string; @@ -14,12 +16,52 @@ export type ChatImageContent = { mimeType: string; }; +/** + * Metadata for an attachment that was offloaded to the media store. + * + * Included in ParsedMessageWithImages.offloadedRefs so that callers can + * persist structured media metadata for transcripts. Without this, consumers + * that derive MediaPath/MediaPaths from the `images` array (e.g. + * persistChatSendImages and buildChatSendTranscriptMessage in chat.ts) would + * silently omit all large attachments that were offloaded to disk. + */ +export type OffloadedRef = { + /** Opaque media URI injected into the message, e.g. "media://inbound/" */ + mediaRef: string; + /** The raw media ID from SavedMedia.id, usable with resolveMediaBufferPath */ + id: string; + /** Absolute filesystem path returned by saveMediaBuffer — used for transcript MediaPath */ + path: string; + /** MIME type of the offloaded attachment */ + mimeType: string; + /** The label / filename of the original attachment */ + label: string; +}; + export type ParsedMessageWithImages = { message: string; + /** Small attachments (≤ OFFLOAD_THRESHOLD_BYTES) passed inline to the model */ images: ChatImageContent[]; + /** Original accepted attachment order after inline/offloaded split. */ + imageOrder: PromptImageOrderEntry[]; + /** + * Large attachments (> OFFLOAD_THRESHOLD_BYTES) that were offloaded to the + * media store. Each entry corresponds to a `[media attached: media://inbound/]` + * marker appended to `message`. + * + * Callers MUST persist this list separately for transcript media metadata. + * It is intentionally separate from `images` because downstream model calls + * do not receive these as inline image blocks. + * + * ⚠️ Call sites (chat.ts, agent.ts, server-node-events.ts) MUST also pass + * `supportsImages: modelSupportsImages(model)` so that text-only model runs + * do not inject unresolvable media:// markers into prompt text. + */ + offloadedRefs: OffloadedRef[]; }; type AttachmentLog = { + info?: (message: string) => void; warn: (message: string) => void; }; @@ -29,6 +71,59 @@ type NormalizedAttachment = { base64: string; }; +type SavedMedia = { + id: string; + path?: string; +}; + +const OFFLOAD_THRESHOLD_BYTES = 2_000_000; + +const MIME_TO_EXT: Record = { + "image/jpeg": ".jpg", + "image/jpg": ".jpg", + "image/png": ".png", + "image/webp": ".webp", + "image/gif": ".gif", + "image/heic": ".heic", + "image/heif": ".heif", + // bmp/tiff excluded from SUPPORTED_OFFLOAD_MIMES to avoid extension-loss + // bug in store.ts; entries kept here for future extension support + "image/bmp": ".bmp", + "image/tiff": ".tiff", +}; + +// Module-level Set for O(1) lookup — not rebuilt on every attachment iteration. +// +// heic/heif are included only if store.ts's extensionForMime maps them to an +// extension. If it does not (same extension-loss risk as bmp/tiff), remove +// them from this set. +const SUPPORTED_OFFLOAD_MIMES = new Set([ + "image/jpeg", + "image/jpg", + "image/png", + "image/webp", + "image/gif", + "image/heic", + "image/heif", +]); + +/** + * Raised when the Gateway cannot persist an attachment to the media store. + * + * Distinct from ordinary input-validation errors so that Gateway handlers can + * map it to a server-side 5xx status rather than a client 4xx. + * + * Example causes: ENOSPC, EPERM, unexpected saveMediaBuffer return shape. + */ +export class MediaOffloadError extends Error { + readonly cause: unknown; + constructor(message: string, options?: ErrorOptions) { + super(message, options); + this.name = "MediaOffloadError"; + this.cause = options?.cause; + } +} + function normalizeMime(mime?: string): string | undefined { if (!mime) { return undefined; @@ -42,8 +137,75 @@ function isImageMime(mime?: string): boolean { } function isValidBase64(value: string): boolean { - // Minimal validation; avoid full decode allocations for large payloads. - return value.length > 0 && value.length % 4 === 0 && /^[A-Za-z0-9+/]+={0,2}$/.test(value); + if (value.length === 0 || value.length % 4 !== 0) { + return false; + } + // A full O(n) regex scan is safe: no overlapping quantifiers, fails linearly. + // Prevents adversarial payloads padded with megabytes of whitespace from + // bypassing length thresholds. + return /^[A-Za-z0-9+/]+={0,2}$/.test(value); +} + +/** + * Confirms that the decoded buffer produced by Buffer.from(b64, 'base64') + * matches the pre-decode size estimate. + * + * Node's Buffer.from silently drops invalid base64 characters rather than + * throwing. A material size discrepancy means the source string contained + * embedded garbage that was silently stripped, which would produce a corrupted + * file on disk. ±3 bytes of slack accounts for base64 padding rounding. + * + * IMPORTANT: this is an input-validation check (4xx client error). + * It MUST be called OUTSIDE the MediaOffloadError try/catch so that + * corrupt-input errors are not misclassified as 5xx server errors. + */ +function verifyDecodedSize(buffer: Buffer, estimatedBytes: number, label: string): void { + if (Math.abs(buffer.byteLength - estimatedBytes) > 3) { + throw new Error( + `attachment ${label}: base64 contains invalid characters ` + + `(expected ~${estimatedBytes} bytes decoded, got ${buffer.byteLength})`, + ); + } +} + +function ensureExtension(label: string, mime: string): string { + if (/\.[a-zA-Z0-9]+$/.test(label)) { + return label; + } + const ext = MIME_TO_EXT[mime.toLowerCase()] ?? ""; + return ext ? `${label}${ext}` : label; +} + +/** + * Type guard for the return value of saveMediaBuffer. + * + * Also validates that the returned ID: + * - is a non-empty string + * - contains no path separators (/ or \) or null bytes + * + * Catching a bad shape here produces a cleaner error than a cryptic failure + * deeper in the stack, and is treated as a 5xx infrastructure error. + */ +function assertSavedMedia(value: unknown, label: string): SavedMedia { + if ( + value !== null && + typeof value === "object" && + "id" in value && + typeof (value as Record).id === "string" + ) { + const id = (value as Record).id as string; + if (id.length === 0) { + throw new Error(`attachment ${label}: saveMediaBuffer returned an empty media ID`); + } + if (id.includes("/") || id.includes("\\") || id.includes("\0")) { + throw new Error( + `attachment ${label}: saveMediaBuffer returned an unsafe media ID ` + + `(contains path separator or null byte)`, + ); + } + return value as SavedMedia; + } + throw new Error(`attachment ${label}: saveMediaBuffer returned an unexpected shape`); } function normalizeAttachment( @@ -64,7 +226,6 @@ function normalizeAttachment( let base64 = content.trim(); if (opts.stripDataUrlPrefix) { - // Strip data URL prefix if present (e.g., "data:image/jpeg;base64,..."). const dataUrlMatch = /^data:[^;]+;base64,(.*)$/.exec(base64); if (dataUrlMatch) { base64 = dataUrlMatch[1]; @@ -91,57 +252,211 @@ function validateAttachmentBase64OrThrow( /** * Parse attachments and extract images as structured content blocks. - * Returns the message text and an array of image content blocks - * compatible with Claude API's image format. + * Returns the message text, inline image blocks, and offloaded media refs. + * + * ## Offload behaviour + * Attachments whose decoded size exceeds OFFLOAD_THRESHOLD_BYTES are saved to + * disk via saveMediaBuffer and replaced with an opaque `media://inbound/` + * URI appended to the message. The agent resolves these URIs via + * resolveMediaBufferPath before passing them to the model. + * + * ## Transcript metadata + * Callers MUST use `result.offloadedRefs` to persist structured media metadata + * for transcripts. These refs are intentionally excluded from `result.images` + * because they are not passed inline to the model. + * + * ## Text-only model runs + * Pass `supportsImages: false` for text-only model runs so that no media:// + * markers are injected into prompt text. + * + * ⚠️ Call sites in chat.ts, agent.ts, and server-node-events.ts MUST be + * updated to pass `supportsImages: modelSupportsImages(model)`. Until they do, + * text-only model runs receive unresolvable media:// markers in their prompt. + * + * ## Cleanup on failure + * On any parse failure after files have already been offloaded, best-effort + * cleanup is performed before rethrowing so that malformed requests do not + * accumulate orphaned files on disk ahead of the periodic TTL sweep. + * + * ## Known ordering limitation + * In mixed large/small batches, the model receives images in a different order + * than the original attachment list because detectAndLoadPromptImages + * initialises from existingImages first, then appends prompt-detected refs. + * A future refactor should unify all image references into a single ordered list. + * + * @throws {MediaOffloadError} Infrastructure failure saving to media store → 5xx. + * @throws {Error} Input validation failure → 4xx. */ export async function parseMessageWithAttachments( message: string, attachments: ChatAttachment[] | undefined, - opts?: { maxBytes?: number; log?: AttachmentLog }, + opts?: { maxBytes?: number; log?: AttachmentLog; supportsImages?: boolean }, ): Promise { - const maxBytes = opts?.maxBytes ?? 5_000_000; // decoded bytes (5,000,000) + const maxBytes = opts?.maxBytes ?? 5_000_000; const log = opts?.log; + if (!attachments || attachments.length === 0) { - return { message, images: [] }; + return { message, images: [], imageOrder: [], offloadedRefs: [] }; + } + + // For text-only models drop all attachments cleanly. Do not save files or + // inject media:// markers that would never be resolved and would leak + // internal path references into the model's prompt. + if (opts?.supportsImages === false) { + if (attachments.length > 0) { + log?.warn( + `parseMessageWithAttachments: ${attachments.length} attachment(s) dropped — model does not support images`, + ); + } + return { message, images: [], imageOrder: [], offloadedRefs: [] }; } const images: ChatImageContent[] = []; + const imageOrder: PromptImageOrderEntry[] = []; + const offloadedRefs: OffloadedRef[] = []; + let updatedMessage = message; - for (const [idx, att] of attachments.entries()) { - if (!att) { - continue; - } - const normalized = normalizeAttachment(att, idx, { - stripDataUrlPrefix: true, - requireImageMime: false, - }); - validateAttachmentBase64OrThrow(normalized, { maxBytes }); - const { base64: b64, label, mime } = normalized; + // Track IDs of files saved during this request for cleanup if a later + // attachment fails validation and the entire parse is aborted. + const savedMediaIds: string[] = []; - const providedMime = normalizeMime(mime); - const sniffedMime = normalizeMime(await sniffMimeFromBase64(b64)); - if (sniffedMime && !isImageMime(sniffedMime)) { - log?.warn(`attachment ${label}: detected non-image (${sniffedMime}), dropping`); - continue; - } - if (!sniffedMime && !isImageMime(providedMime)) { - log?.warn(`attachment ${label}: unable to detect image mime type, dropping`); - continue; - } - if (sniffedMime && providedMime && sniffedMime !== providedMime) { - log?.warn( - `attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using sniffed`, - ); - } + try { + for (const [idx, att] of attachments.entries()) { + if (!att) { + continue; + } - images.push({ - type: "image", - data: b64, - mimeType: sniffedMime ?? providedMime ?? mime, - }); + const normalized = normalizeAttachment(att, idx, { + stripDataUrlPrefix: true, + requireImageMime: false, + }); + + const { base64: b64, label, mime } = normalized; + + if (!isValidBase64(b64)) { + throw new Error(`attachment ${label}: invalid base64 content`); + } + + const sizeBytes = estimateBase64DecodedBytes(b64); + if (sizeBytes <= 0) { + log?.warn(`attachment ${label}: estimated size is zero, dropping`); + continue; + } + + if (sizeBytes > maxBytes) { + throw new Error( + `attachment ${label}: exceeds size limit (${sizeBytes} > ${maxBytes} bytes)`, + ); + } + + const providedMime = normalizeMime(mime); + const sniffedMime = normalizeMime(await sniffMimeFromBase64(b64)); + + if (sniffedMime && !isImageMime(sniffedMime)) { + log?.warn(`attachment ${label}: detected non-image (${sniffedMime}), dropping`); + continue; + } + if (!sniffedMime && !isImageMime(providedMime)) { + log?.warn(`attachment ${label}: unable to detect image mime type, dropping`); + continue; + } + if (sniffedMime && providedMime && sniffedMime !== providedMime) { + log?.warn( + `attachment ${label}: mime mismatch (${providedMime} -> ${sniffedMime}), using sniffed`, + ); + } + + // Third fallback normalises `mime` so a raw un-normalised string (e.g. + // "IMAGE/JPEG") does not silently bypass the SUPPORTED_OFFLOAD_MIMES check. + const finalMime = sniffedMime ?? providedMime ?? normalizeMime(mime) ?? mime; + + let isOffloaded = false; + + if (sizeBytes > OFFLOAD_THRESHOLD_BYTES) { + const isSupportedForOffload = SUPPORTED_OFFLOAD_MIMES.has(finalMime); + + if (!isSupportedForOffload) { + // Passing this inline would reintroduce the OOM risk this PR prevents. + throw new Error( + `attachment ${label}: format ${finalMime} is too large to pass inline ` + + `(${sizeBytes} > ${OFFLOAD_THRESHOLD_BYTES} bytes) and cannot be offloaded. ` + + `Please convert to JPEG, PNG, WEBP, GIF, HEIC, or HEIF.`, + ); + } + + // Decode and run input-validation BEFORE the MediaOffloadError try/catch. + // verifyDecodedSize is a 4xx client error and must not be wrapped as a + // 5xx MediaOffloadError. + const buffer = Buffer.from(b64, "base64"); + verifyDecodedSize(buffer, sizeBytes, label); + + // Only the storage operation is wrapped so callers can distinguish + // infrastructure failures (5xx) from input errors (4xx). + try { + const labelWithExt = ensureExtension(label, finalMime); + + const rawResult = await saveMediaBuffer( + buffer, + finalMime, + "inbound", + maxBytes, + labelWithExt, + ); + + const savedMedia = assertSavedMedia(rawResult, label); + + // Track for cleanup if a subsequent attachment fails. + savedMediaIds.push(savedMedia.id); + + // Opaque URI — compatible with workspaceOnly sandboxes and decouples + // the Gateway from the agent's filesystem layout. + const mediaRef = `media://inbound/${savedMedia.id}`; + + updatedMessage += `\n[media attached: ${mediaRef}]`; + log?.info?.(`[Gateway] Intercepted large image payload. Saved: ${mediaRef}`); + + // Record for transcript metadata — separate from `images` because + // these are not passed inline to the model. + offloadedRefs.push({ + mediaRef, + id: savedMedia.id, + path: savedMedia.path ?? "", + mimeType: finalMime, + label, + }); + imageOrder.push("offloaded"); + + isOffloaded = true; + } catch (err) { + const errorMessage = err instanceof Error ? err.message : String(err); + throw new MediaOffloadError( + `[Gateway Error] Failed to save intercepted media to disk: ${errorMessage}`, + { cause: err }, + ); + } + } + + if (isOffloaded) { + continue; + } + + images.push({ type: "image", data: b64, mimeType: finalMime }); + imageOrder.push("inline"); + } + } catch (err) { + // Best-effort cleanup before rethrowing. + if (savedMediaIds.length > 0) { + await Promise.allSettled(savedMediaIds.map((id) => deleteMediaBuffer(id, "inbound"))); + } + throw err; } - return { message, images }; + return { + message: updatedMessage !== message ? updatedMessage.trimEnd() : message, + images, + imageOrder, + offloadedRefs, + }; } /** @@ -153,7 +468,8 @@ export function buildMessageWithAttachments( attachments: ChatAttachment[] | undefined, opts?: { maxBytes?: number }, ): string { - const maxBytes = opts?.maxBytes ?? 2_000_000; // 2 MB + const maxBytes = opts?.maxBytes ?? 2_000_000; + if (!attachments || attachments.length === 0) { return message; } @@ -164,21 +480,22 @@ export function buildMessageWithAttachments( if (!att) { continue; } + const normalized = normalizeAttachment(att, idx, { stripDataUrlPrefix: false, requireImageMime: true, }); validateAttachmentBase64OrThrow(normalized, { maxBytes }); - const { base64, label, mime } = normalized; + const { base64, label, mime } = normalized; const safeLabel = label.replace(/\s+/g, "_"); - const dataUrl = `![${safeLabel}](data:${mime};base64,${base64})`; - blocks.push(dataUrl); + blocks.push(`![${safeLabel}](data:${mime};base64,${base64})`); } if (blocks.length === 0) { return message; } + const separator = message.trim().length > 0 ? "\n\n" : ""; return `${message}${separator}${blocks.join("\n\n")}`; } diff --git a/src/gateway/server-methods/agent.ts b/src/gateway/server-methods/agent.ts index 43d1617f159..7ce897b7018 100644 --- a/src/gateway/server-methods/agent.ts +++ b/src/gateway/server-methods/agent.ts @@ -23,6 +23,7 @@ import { } from "../../infra/outbound/agent-delivery.js"; import { shouldDowngradeDeliveryToSessionOnly } from "../../infra/outbound/best-effort-delivery.js"; import { resolveMessageChannelSelection } from "../../infra/outbound/channel-selection.js"; +import type { PromptImageOrderEntry } from "../../media/prompt-image-order.js"; import { classifySessionKeyShape, normalizeAgentId } from "../../routing/session-key.js"; import { defaultRuntime } from "../../runtime.js"; import { normalizeInputProvenance, type InputProvenance } from "../../sessions/input-provenance.js"; @@ -39,7 +40,7 @@ import { normalizeMessageChannel, } from "../../utils/message-channel.js"; import { resolveAssistantIdentity } from "../assistant-identity.js"; -import { parseMessageWithAttachments } from "../chat-attachments.js"; +import { MediaOffloadError, parseMessageWithAttachments } from "../chat-attachments.js"; import { resolveAssistantAvatarUrl } from "../control-ui-shared.js"; import { ADMIN_SCOPE } from "../method-scopes.js"; import { GATEWAY_CLIENT_CAPS, hasGatewayClientCap } from "../protocol/client-info.js"; @@ -58,6 +59,8 @@ import { loadGatewaySessionRow, loadSessionEntry, migrateAndPruneGatewaySessionStoreKey, + resolveGatewayModelSupportsImages, + resolveSessionModelRef, } from "../session-utils.js"; import { formatForLog } from "../ws-log.js"; import { waitForAgentJob } from "./agent-job.js"; @@ -346,16 +349,53 @@ export const agentHandlers: GatewayRequestHandlers = { let message = (request.message ?? "").trim(); let images: Array<{ type: "image"; data: string; mimeType: string }> = []; + let imageOrder: PromptImageOrderEntry[] = []; if (normalizedAttachments.length > 0) { + const requestedSessionKeyRaw = + typeof request.sessionKey === "string" && request.sessionKey.trim() + ? request.sessionKey.trim() + : undefined; + + let baseProvider: string | undefined; + let baseModel: string | undefined; + if (requestedSessionKeyRaw) { + const { cfg: sessCfg, entry: sessEntry } = loadSessionEntry(requestedSessionKeyRaw); + const modelRef = resolveSessionModelRef(sessCfg, sessEntry, undefined); + baseProvider = modelRef.provider; + baseModel = modelRef.model; + } + const effectiveProvider = providerOverride || baseProvider; + const effectiveModel = modelOverride || baseModel; + const supportsImages = await resolveGatewayModelSupportsImages({ + loadGatewayModelCatalog: context.loadGatewayModelCatalog, + provider: effectiveProvider, + model: effectiveModel, + }); + try { const parsed = await parseMessageWithAttachments(message, normalizedAttachments, { maxBytes: 5_000_000, log: context.logGateway, + supportsImages, }); message = parsed.message.trim(); images = parsed.images; + imageOrder = parsed.imageOrder; + // offloadedRefs are appended as text markers to `message`; the agent + // runner will resolve them via detectAndLoadPromptImages. } catch (err) { - respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, String(err))); + // MediaOffloadError indicates a server-side storage fault (ENOSPC, EPERM, + // etc.). Map it to UNAVAILABLE so clients can retry without treating it as + // a bad request. All other errors are input-validation failures → 4xx. + const isServerFault = err instanceof MediaOffloadError; + respond( + false, + undefined, + errorShape( + isServerFault ? ErrorCodes.UNAVAILABLE : ErrorCodes.INVALID_REQUEST, + String(err), + ), + ); return; } } @@ -765,6 +805,7 @@ export const agentHandlers: GatewayRequestHandlers = { ingressOpts: { message, images, + imageOrder, provider: providerOverride, model: modelOverride, to: resolvedTo, diff --git a/src/gateway/server-methods/chat.ts b/src/gateway/server-methods/chat.ts index 5b38db73903..5ebe2732838 100644 --- a/src/gateway/server-methods/chat.ts +++ b/src/gateway/server-methods/chat.ts @@ -12,6 +12,7 @@ import { isSilentReplyText, SILENT_REPLY_TOKEN } from "../../auto-reply/tokens.j import type { ReplyPayload } from "../../auto-reply/types.js"; import { resolveSessionFilePath } from "../../config/sessions.js"; import { jsonUtf8Bytes } from "../../infra/json-utf8-bytes.js"; +import type { PromptImageOrderEntry } from "../../media/prompt-image-order.js"; import { type SavedMedia, saveMediaBuffer } from "../../media/store.js"; import { createChannelReplyPipeline } from "../../plugin-sdk/channel-reply-pipeline.js"; import { normalizeInputProvenance, type InputProvenance } from "../../sessions/input-provenance.js"; @@ -35,7 +36,12 @@ import { isChatStopCommandText, resolveChatRunExpiresAtMs, } from "../chat-abort.js"; -import { type ChatImageContent, parseMessageWithAttachments } from "../chat-attachments.js"; +import { + type ChatImageContent, + MediaOffloadError, + type OffloadedRef, + parseMessageWithAttachments, +} from "../chat-attachments.js"; import { stripEnvelopeFromMessage, stripEnvelopeFromMessages } from "../chat-sanitize.js"; import { augmentChatHistoryWithCliSessionImports } from "../cli-session-history.js"; import { ADMIN_SCOPE } from "../method-scopes.js"; @@ -59,6 +65,7 @@ import { getMaxChatHistoryMessagesBytes } from "../server-constants.js"; import { capArrayByJsonBytes, loadSessionEntry, + resolveGatewayModelSupportsImages, readSessionMessages, resolveSessionModelRef, } from "../session-utils.js"; @@ -326,16 +333,50 @@ function canInjectSystemProvenance(client: GatewayRequestHandlerOptions["client" return scopes.includes(ADMIN_SCOPE); } +/** + * Persist inline images and offloaded-ref media to the transcript media store. + * + * Inline images are re-saved from their base64 payload so that a stable + * filesystem path can be stored in the transcript. Offloaded refs are already + * on disk (saved by parseMessageWithAttachments); their SavedMedia metadata is + * synthesised directly from the OffloadedRef, avoiding a redundant write. + * + * Both sets are combined so that transcript media fields remain complete + * regardless of whether attachments were inlined or offloaded. + */ async function persistChatSendImages(params: { images: ChatImageContent[]; + imageOrder: PromptImageOrderEntry[]; + offloadedRefs: OffloadedRef[]; client: GatewayRequestHandlerOptions["client"]; logGateway: GatewayRequestContext["logGateway"]; }): Promise { - if (params.images.length === 0 || isAcpBridgeClient(params.client)) { + if (isAcpBridgeClient(params.client)) { return []; } + const saved: SavedMedia[] = []; - for (const img of params.images) { + let inlineIndex = 0; + let offloadedIndex = 0; + for (const entry of params.imageOrder) { + if (entry === "offloaded") { + const ref = params.offloadedRefs[offloadedIndex++]; + if (!ref) { + continue; + } + saved.push({ + id: ref.id, + path: ref.path, + size: 0, + contentType: ref.mimeType, + }); + continue; + } + + const img = params.images[inlineIndex++]; + if (!img) { + continue; + } try { saved.push(await saveMediaBuffer(Buffer.from(img.data, "base64"), img.mimeType, "inbound")); } catch (err) { @@ -344,6 +385,7 @@ async function persistChatSendImages(params: { ); } } + return saved; } @@ -1387,23 +1429,18 @@ export const chatHandlers: GatewayRequestHandlers = { ); return; } - let parsedMessage = inboundMessage; - let parsedImages: ChatImageContent[] = []; - if (normalizedAttachments.length > 0) { - try { - const parsed = await parseMessageWithAttachments(inboundMessage, normalizedAttachments, { - maxBytes: 5_000_000, - log: context.logGateway, - }); - parsedMessage = parsed.message; - parsedImages = parsed.images; - } catch (err) { - respond(false, undefined, errorShape(ErrorCodes.INVALID_REQUEST, String(err))); - return; - } - } + + // Load session entry before attachment parsing so we can gate media-URI + // marker injection on the model's image capability. This prevents opaque + // media:// markers from leaking into prompts for text-only model runs. const rawSessionKey = p.sessionKey; const { cfg, entry, canonicalKey: sessionKey } = loadSessionEntry(rawSessionKey); + + let parsedMessage = inboundMessage; + let parsedImages: ChatImageContent[] = []; + let parsedImageOrder: PromptImageOrderEntry[] = []; + let parsedOffloadedRefs: OffloadedRef[] = []; + const timeoutMs = resolveAgentTimeoutMs({ cfg, overrideMs: p.timeoutMs, @@ -1461,6 +1498,43 @@ export const chatHandlers: GatewayRequestHandlers = { return; } + if (normalizedAttachments.length > 0) { + const sessionAgentId = resolveSessionAgentId({ sessionKey, config: cfg }); + const modelRef = resolveSessionModelRef(cfg, entry, sessionAgentId); + const supportsImages = await resolveGatewayModelSupportsImages({ + loadGatewayModelCatalog: context.loadGatewayModelCatalog, + provider: modelRef.provider, + model: modelRef.model, + }); + + try { + const parsed = await parseMessageWithAttachments(inboundMessage, normalizedAttachments, { + maxBytes: 5_000_000, + log: context.logGateway, + supportsImages, + }); + parsedMessage = parsed.message; + parsedImages = parsed.images; + parsedImageOrder = parsed.imageOrder; + parsedOffloadedRefs = parsed.offloadedRefs; + } catch (err) { + // MediaOffloadError indicates a server-side storage fault (ENOSPC, EPERM, + // etc.). All other errors are client-side input validation failures. + // Map them to different HTTP status codes so callers can retry server + // faults without treating them as bad requests. + const isServerFault = err instanceof MediaOffloadError; + respond( + false, + undefined, + errorShape( + isServerFault ? ErrorCodes.UNAVAILABLE : ErrorCodes.INVALID_REQUEST, + String(err), + ), + ); + return; + } + } + try { const abortController = new AbortController(); context.chatAbortControllers.set(clientRunId, { @@ -1477,8 +1551,15 @@ export const chatHandlers: GatewayRequestHandlers = { status: "started" as const, }; respond(true, ackPayload, undefined, { runId: clientRunId }); + + // Persist both inline images and already-offloaded refs to the media + // store so that transcript media fields remain complete for all attachment + // sizes. Offloaded refs are already on disk; persistChatSendImages converts + // their metadata without re-writing the files. const persistedImagesPromise = persistChatSendImages({ images: parsedImages, + imageOrder: parsedImageOrder, + offloadedRefs: parsedOffloadedRefs, client, logGateway: context.logGateway, }); @@ -1638,6 +1719,7 @@ export const chatHandlers: GatewayRequestHandlers = { runId: clientRunId, abortSignal: abortController.signal, images: parsedImages.length > 0 ? parsedImages : undefined, + imageOrder: parsedImageOrder.length > 0 ? parsedImageOrder : undefined, onAgentRunStart: (runId) => { agentRunStarted = true; void emitUserTranscriptUpdate(); diff --git a/src/gateway/server-node-events.test.ts b/src/gateway/server-node-events.test.ts index f2220508138..5536806a576 100644 --- a/src/gateway/server-node-events.test.ts +++ b/src/gateway/server-node-events.test.ts @@ -7,6 +7,8 @@ const buildSessionLookup = ( sessionKey: string, entry: { sessionId?: string; + model?: string; + modelProvider?: string; lastChannel?: string; lastTo?: string; lastAccountId?: string; @@ -23,6 +25,8 @@ const buildSessionLookup = ( entry: { sessionId: entry.sessionId ?? `sid-${sessionKey}`, updatedAt: entry.updatedAt ?? Date.now(), + model: entry.model, + modelProvider: entry.modelProvider, lastChannel: entry.lastChannel, lastTo: entry.lastTo, lastAccountId: entry.lastAccountId, @@ -44,6 +48,7 @@ const loadOrCreateDeviceIdentityMock = vi.hoisted(() => privateKeyPem: "private", })), ); +const parseMessageWithAttachmentsMock = vi.hoisted(() => vi.fn()); const normalizeChannelIdMock = vi.hoisted(() => vi.fn((channel?: string | null) => channel ?? null), ); @@ -79,6 +84,13 @@ vi.mock("../infra/push-apns.js", () => ({ vi.mock("../infra/device-identity.js", () => ({ loadOrCreateDeviceIdentity: loadOrCreateDeviceIdentityMock, })); +vi.mock("./chat-attachments.js", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + parseMessageWithAttachments: parseMessageWithAttachmentsMock, + }; +}); vi.mock("./session-utils.js", () => ({ loadSessionEntry: vi.fn((sessionKey: string) => buildSessionLookup(sessionKey)), migrateAndPruneGatewaySessionStoreKey: vi.fn( @@ -89,10 +101,38 @@ vi.mock("./session-utils.js", () => ({ }), ), pruneLegacyStoreKeys: vi.fn(), + resolveGatewayModelSupportsImages: vi.fn( + async ({ + loadGatewayModelCatalog, + provider, + model, + }: { + loadGatewayModelCatalog: () => Promise< + Array<{ id: string; provider: string; input?: string[] }> + >; + provider?: string; + model?: string; + }) => { + if (!model) { + return true; + } + const catalog = await loadGatewayModelCatalog(); + const modelEntry = catalog.find( + (entry) => entry.id === model && (!provider || entry.provider === provider), + ); + return modelEntry ? (modelEntry.input?.includes("image") ?? false) : true; + }, + ), resolveGatewaySessionStoreTarget: vi.fn(({ key }: { key: string }) => ({ canonicalKey: key, storeKeys: [key], })), + resolveSessionModelRef: vi.fn( + (_cfg: OpenClawConfig, entry?: { model?: string; modelProvider?: string }) => ({ + provider: entry?.modelProvider ?? "test-provider", + model: entry?.model ?? "default-model", + }), + ), })); import { normalizeChannelId } from "../channels/plugins/index.js"; @@ -751,10 +791,17 @@ describe("notifications changed events", () => { describe("agent request events", () => { beforeEach(() => { agentCommandMock.mockClear(); + parseMessageWithAttachmentsMock.mockReset(); updateSessionStoreMock.mockClear(); loadSessionEntryMock.mockClear(); normalizeChannelIdVi.mockClear(); normalizeChannelIdVi.mockImplementation((channel?: string | null) => channel ?? null); + parseMessageWithAttachmentsMock.mockResolvedValue({ + message: "parsed message", + images: [], + imageOrder: [], + offloadedRefs: [], + }); agentCommandMock.mockResolvedValue({ status: "ok" } as never); updateSessionStoreMock.mockImplementation(async (_storePath, update) => { update({}); @@ -821,4 +868,45 @@ describe("agent request events", () => { }); expect(opts.runId).toBe(opts.sessionId); }); + + it("passes supportsImages false for text-only node-session models", async () => { + const ctx = buildCtx(); + ctx.loadGatewayModelCatalog = async () => [ + { + id: "text-only", + name: "Text only", + provider: "test-provider", + input: ["text"], + }, + ]; + loadSessionEntryMock.mockReturnValueOnce({ + ...buildSessionLookup("agent:main:main", { + model: "text-only", + modelProvider: "test-provider", + }), + canonicalKey: "agent:main:main", + }); + + await handleNodeEvent(ctx, "node-text-only", { + event: "agent.request", + payloadJSON: JSON.stringify({ + message: "describe", + sessionKey: "agent:main:main", + attachments: [ + { + type: "image", + mimeType: "image/png", + fileName: "dot.png", + content: "AAAA", + }, + ], + }), + }); + + expect(parseMessageWithAttachmentsMock).toHaveBeenCalledWith( + "describe", + expect.any(Array), + expect.objectContaining({ supportsImages: false }), + ); + }); }); diff --git a/src/gateway/server-node-events.ts b/src/gateway/server-node-events.ts index 4b05a82faf9..9a8294339ed 100644 --- a/src/gateway/server-node-events.ts +++ b/src/gateway/server-node-events.ts @@ -1,4 +1,5 @@ import { randomUUID } from "node:crypto"; +import { resolveSessionAgentId } from "../agents/agent-scope.js"; import { sanitizeInboundSystemTags } from "../auto-reply/reply/inbound-text.js"; import { normalizeChannelId } from "../channels/plugins/index.js"; import { createOutboundSendDeps } from "../cli/outbound-send-deps.js"; @@ -12,12 +13,19 @@ import { buildOutboundSessionContext } from "../infra/outbound/session-context.j import { resolveOutboundTarget } from "../infra/outbound/targets.js"; import { registerApnsRegistration } from "../infra/push-apns.js"; import { enqueueSystemEvent } from "../infra/system-events.js"; +import type { PromptImageOrderEntry } from "../media/prompt-image-order.js"; +import { deleteMediaBuffer } from "../media/store.js"; import { normalizeMainKey, scopedHeartbeatWakeOptions } from "../routing/session-key.js"; import { defaultRuntime } from "../runtime.js"; import { parseMessageWithAttachments } from "./chat-attachments.js"; import { normalizeRpcAttachmentsToChatAttachments } from "./server-methods/attachment-normalize.js"; import type { NodeEvent, NodeEventContext } from "./server-node-events-types.js"; -import { loadSessionEntry, migrateAndPruneGatewaySessionStoreKey } from "./session-utils.js"; +import { + loadSessionEntry, + migrateAndPruneGatewaySessionStoreKey, + resolveGatewayModelSupportsImages, + resolveSessionModelRef, +} from "./session-utils.js"; import { formatForLog } from "./ws-log.js"; const MAX_EXEC_EVENT_OUTPUT_CHARS = 180; @@ -347,33 +355,74 @@ export const handleNodeEvent = async (ctx: NodeEventContext, nodeId: string, evt timeoutSeconds?: number | null; key?: string | null; }; + let link: AgentDeepLink | null = null; try { link = JSON.parse(evt.payloadJSON) as AgentDeepLink; } catch { return; } + + const sessionKeyRaw = (link?.sessionKey ?? "").trim(); + const sessionKey = sessionKeyRaw.length > 0 ? sessionKeyRaw : `node-${nodeId}`; + const cfg = loadConfig(); + const { storePath, entry, canonicalKey } = loadSessionEntry(sessionKey); + let message = (link?.message ?? "").trim(); const normalizedAttachments = normalizeRpcAttachmentsToChatAttachments( link?.attachments ?? undefined, ); let images: Array<{ type: "image"; data: string; mimeType: string }> = []; + let imageOrder: PromptImageOrderEntry[] = []; + if (!message && normalizedAttachments.length === 0) { + return; + } + if (message.length > 20_000) { + return; + } if (normalizedAttachments.length > 0) { + const sessionAgentId = resolveSessionAgentId({ sessionKey, config: cfg }); + const modelRef = resolveSessionModelRef(cfg, entry, sessionAgentId); + const supportsImages = await resolveGatewayModelSupportsImages({ + loadGatewayModelCatalog: ctx.loadGatewayModelCatalog, + provider: modelRef.provider, + model: modelRef.model, + }); try { const parsed = await parseMessageWithAttachments(message, normalizedAttachments, { maxBytes: 5_000_000, log: ctx.logGateway, + supportsImages, }); message = parsed.message.trim(); images = parsed.images; - } catch { + imageOrder = parsed.imageOrder; + if (message.length > 20_000) { + ctx.logGateway.warn( + `agent.request message exceeds limit after attachment parsing (length=${message.length})`, + ); + if (parsed.offloadedRefs && parsed.offloadedRefs.length > 0) { + for (const ref of parsed.offloadedRefs) { + try { + await deleteMediaBuffer(ref.id); + } catch (cleanupErr) { + ctx.logGateway.warn( + `Failed to cleanup orphaned media ${ref.id}: ${cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr)}`, + ); + } + } + } + return; + } + } catch (err) { + ctx.logGateway.warn( + `agent.request attachment parse failed: ${err instanceof Error ? err.message : String(err)}`, + ); return; } } - if (!message) { - return; - } - if (message.length > 20_000) { + + if (!message && images.length === 0) { return; } @@ -386,10 +435,6 @@ export const handleNodeEvent = async (ctx: NodeEventContext, nodeId: string, evt const receiptText = receiptTextRaw || "Just received your iOS share + request, working on it."; - const sessionKeyRaw = (link?.sessionKey ?? "").trim(); - const sessionKey = sessionKeyRaw.length > 0 ? sessionKeyRaw : `node-${nodeId}`; - const cfg = loadConfig(); - const { storePath, entry, canonicalKey } = loadSessionEntry(sessionKey); const now = Date.now(); const sessionId = entry?.sessionId ?? randomUUID(); await touchSessionStore({ cfg, sessionKey, storePath, canonicalKey, entry, sessionId, now }); @@ -438,6 +483,7 @@ export const handleNodeEvent = async (ctx: NodeEventContext, nodeId: string, evt runId: sessionId, message, images, + imageOrder, sessionId, sessionKey: canonicalKey, thinking: link?.thinking ?? undefined, diff --git a/src/gateway/session-utils.test.ts b/src/gateway/session-utils.test.ts index cff89904182..6242900a3e7 100644 --- a/src/gateway/session-utils.test.ts +++ b/src/gateway/session-utils.test.ts @@ -22,6 +22,7 @@ import { migrateAndPruneGatewaySessionStoreKey, parseGroupKey, pruneLegacyStoreKeys, + resolveGatewayModelSupportsImages, resolveGatewaySessionStoreTarget, resolveSessionModelIdentityRef, resolveSessionModelRef, @@ -2356,6 +2357,30 @@ describe("listSessionsFromStore subagent metadata", () => { expect(timeout?.status).toBe("timeout"); expect(timeout?.runtimeMs).toBe(0); }); + + test("fails closed when model lookup misses", async () => { + await expect( + resolveGatewayModelSupportsImages({ + model: "gpt-5.4", + provider: "openai", + loadGatewayModelCatalog: async () => [ + { id: "gpt-5.4", name: "GPT-5.4", provider: "other", input: ["text", "image"] }, + ], + }), + ).resolves.toBe(false); + }); + + test("fails closed when model catalog load throws", async () => { + await expect( + resolveGatewayModelSupportsImages({ + model: "gpt-5.4", + provider: "openai", + loadGatewayModelCatalog: async () => { + throw new Error("catalog unavailable"); + }, + }), + ).resolves.toBe(false); + }); }); describe("loadCombinedSessionStoreForGateway includes disk-only agents (#32804)", () => { diff --git a/src/gateway/session-utils.ts b/src/gateway/session-utils.ts index b107026c858..f401724debe 100644 --- a/src/gateway/session-utils.ts +++ b/src/gateway/session-utils.ts @@ -8,6 +8,7 @@ import { } from "../agents/agent-scope.js"; import { lookupContextTokens, resolveContextTokensForModel } from "../agents/context.js"; import { DEFAULT_CONTEXT_TOKENS, DEFAULT_MODEL, DEFAULT_PROVIDER } from "../agents/defaults.js"; +import type { ModelCatalogEntry } from "../agents/model-catalog.js"; import { inferUniqueProviderFromConfiguredModels, parseModelRef, @@ -1074,6 +1075,27 @@ export function resolveSessionModelRef( return { provider, model }; } +export async function resolveGatewayModelSupportsImages(params: { + loadGatewayModelCatalog: () => Promise; + provider?: string; + model?: string; +}): Promise { + if (!params.model) { + return true; + } + + try { + const catalog = await params.loadGatewayModelCatalog(); + const modelEntry = catalog.find( + (entry) => + entry.id === params.model && (!params.provider || entry.provider === params.provider), + ); + return modelEntry ? (modelEntry.input?.includes("image") ?? false) : false; + } catch { + return false; + } +} + export function resolveSessionModelIdentityRef( cfg: OpenClawConfig, entry?: diff --git a/src/media/prompt-image-order.ts b/src/media/prompt-image-order.ts new file mode 100644 index 00000000000..97b8763e8ff --- /dev/null +++ b/src/media/prompt-image-order.ts @@ -0,0 +1 @@ +export type PromptImageOrderEntry = "inline" | "offloaded"; diff --git a/src/media/store.ts b/src/media/store.ts index 2bec9b9cd62..85ff587943d 100644 --- a/src/media/store.ts +++ b/src/media/store.ts @@ -407,3 +407,99 @@ export async function saveMediaBuffer( await writeSavedMediaBuffer({ dir, id, buffer }); return buildSavedMediaResult({ dir, id, size: buffer.byteLength, contentType: mime }); } + +/** + * Resolves a media ID saved by saveMediaBuffer to its absolute physical path. + * + * This is the read-side counterpart to saveMediaBuffer and is used by the + * agent runner to hydrate opaque `media://inbound/` URIs written by the + * Gateway's claim-check offload path. + * + * Security: + * - Rejects IDs containing path separators, "..", or null bytes to prevent + * directory traversal and path injection outside the resolved subdir. + * - Verifies the resolved path is a regular file (not a symlink or directory) + * before returning it, matching the write-side MEDIA_FILE_MODE policy. + * + * @param id The media ID as returned by SavedMedia.id (may include + * extension and original-filename prefix, + * e.g. "photo---.png" or "图片---.png"). + * @param subdir The subdirectory the file was saved into (default "inbound"). + * @returns Absolute path to the file on disk. + * @throws If the ID is unsafe, the file does not exist, or is not a + * regular file. + */ +export async function resolveMediaBufferPath( + id: string, + subdir: "inbound" = "inbound", +): Promise { + // Guard against path traversal and null-byte injection. + // + // - Separator checks: reject any ID containing "/" or "\" (covers all + // relative traversal sequences such as "../foo" or "..\\foo"). + // - Exact ".." check: reject the bare traversal operator in case a caller + // strips separators but keeps the dots. + // - Null-byte check: reject "\0" which can truncate paths on some platforms + // and cause the OS to open a different file than intended. + // + // We allow consecutive dots in legitimate filenames (e.g. "report..draft.png"), + // so we only reject the exact two-character string "..". + // + // JSON.stringify is used in the error message so that control characters + // (including \0) are rendered visibly in logs rather than silently dropped. + if (!id || id.includes("/") || id.includes("\\") || id.includes("\0") || id === "..") { + throw new Error(`resolveMediaBufferPath: unsafe media ID: ${JSON.stringify(id)}`); + } + + const dir = path.join(resolveMediaDir(), subdir); + const resolved = path.join(dir, id); + + // Double-check that path.join didn't escape the intended directory. + // This should be unreachable after the separator check above, but be + // explicit about the invariant. + if (!resolved.startsWith(dir + path.sep) && resolved !== dir) { + throw new Error(`resolveMediaBufferPath: path escapes media directory: ${JSON.stringify(id)}`); + } + + // lstat (not stat) so we see symlinks rather than following them. + const stat = await fs.lstat(resolved); + + if (stat.isSymbolicLink()) { + throw new Error( + `resolveMediaBufferPath: refusing to follow symlink for media ID: ${JSON.stringify(id)}`, + ); + } + if (!stat.isFile()) { + throw new Error( + `resolveMediaBufferPath: media ID does not resolve to a file: ${JSON.stringify(id)}`, + ); + } + + return resolved; +} + +/** + * Deletes a file previously saved by saveMediaBuffer. + * + * This is used by parseMessageWithAttachments to clean up files that were + * successfully offloaded earlier in the same request when a later attachment + * fails validation and the entire parse is aborted, preventing orphaned files + * from accumulating on disk ahead of the periodic TTL sweep. + * + * Uses resolveMediaBufferPath to apply the same path-safety guards as the + * read path (separator checks, symlink rejection, etc.) before unlinking. + * + * Errors are intentionally not suppressed — callers that want best-effort + * cleanup should catch and discard exceptions themselves (e.g. via + * Promise.allSettled). + * + * @param id The media ID as returned by SavedMedia.id. + * @param subdir The subdirectory the file was saved into (default "inbound"). + */ +export async function deleteMediaBuffer( + id: string, + subdir: "inbound" = "inbound", +): Promise { + const physicalPath = await resolveMediaBufferPath(id, subdir); + await fs.unlink(physicalPath); +} diff --git a/test/test-env.ts b/test/test-env.ts index e247f396772..f7dd51520ad 100644 --- a/test/test-env.ts +++ b/test/test-env.ts @@ -233,13 +233,16 @@ function sanitizeLiveConfig(raw: string): string { list?: Array>; }; } = JSON5.parse(raw); + if (!parsed || typeof parsed !== "object") { return raw; } + if (parsed.agents?.defaults && typeof parsed.agents.defaults === "object") { delete parsed.agents.defaults.workspace; delete parsed.agents.defaults.agentDir; } + if (Array.isArray(parsed.agents?.list)) { parsed.agents.list = parsed.agents.list.map((entry) => { if (!entry || typeof entry !== "object") { @@ -251,6 +254,7 @@ function sanitizeLiveConfig(raw: string): string { return nextEntry; }); } + return `${JSON.stringify(parsed, null, 2)}\n`; } catch { return raw;