diff --git a/CHANGELOG.md b/CHANGELOG.md index abbcbb60a43..47cf51dcf08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Docs: https://docs.openclaw.ai - CLI/Config validation: add `openclaw config validate` (with `--json`) to validate config files before gateway startup, and include detailed invalid-key paths in startup invalid-config errors. (#31220) thanks @Sid-Qin. - Sessions/Attachments: add inline file attachment support for `sessions_spawn` (subagent runtime only) with base64/utf8 encoding, transcript content redaction, lifecycle cleanup, and configurable limits via `tools.sessions_spawn.attachments`. (#16761) Thanks @napetrov. - Agents/Thinking defaults: set `adaptive` as the default thinking level for Anthropic Claude 4.6 models (including Bedrock Claude 4.6 refs) while keeping other reasoning-capable models at `low` unless explicitly configured. +- Tools/PDF analysis: add a first-class `pdf` tool with native Anthropic and Google PDF provider support, extraction fallback for non-native models, configurable defaults (`agents.defaults.pdfModel`, `pdfMaxBytesMb`, `pdfMaxPages`), and docs/tests covering routing, validation, and registration. (#31319) Thanks @tyler6204. - Gateway/Container probes: add built-in HTTP liveness/readiness endpoints (`/health`, `/healthz`, `/ready`, `/readyz`) for Docker/Kubernetes health checks, with fallback routing so existing handlers on those paths are not shadowed. (#31272) Thanks @vincentkoc. - Android/Nodes: add `camera.list`, `device.permissions`, `device.health`, and `notifications.actions` (`open`/`dismiss`/`reply`) on Android nodes, plus first-class node-tool actions for the new device/notification commands. (#28260) Thanks @obviyus. - Discord/Thread bindings: replace fixed TTL lifecycle with inactivity (`idleHours`, default 24h) plus optional hard `maxAgeHours` lifecycle controls, and add `/session idle` + `/session max-age` commands for focused thread-bound sessions. (#27845) Thanks @osolmaz. 
diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md index eb6c0843ab3..bdf6fbdb639 100644 --- a/docs/gateway/configuration-reference.md +++ b/docs/gateway/configuration-reference.md @@ -835,6 +835,12 @@ Time format in system prompt. Default: `auto` (OS preference). primary: "openrouter/qwen/qwen-2.5-vl-72b-instruct:free", fallbacks: ["openrouter/google/gemini-2.0-flash-vision:free"], }, + pdfModel: { + primary: "anthropic/claude-opus-4-6", + fallbacks: ["openai/gpt-5-mini"], + }, + pdfMaxBytesMb: 10, + pdfMaxPages: 20, thinkingDefault: "low", verboseDefault: "off", elevatedDefault: "on", @@ -853,6 +859,11 @@ Time format in system prompt. Default: `auto` (OS preference). - `imageModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`). - Used by the `image` tool path as its vision-model config. - Also used as fallback routing when the selected/default model cannot accept image input. +- `pdfModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`). + - Used by the `pdf` tool for model routing. + - If omitted, the PDF tool falls back to `imageModel`, then to best-effort provider defaults. +- `pdfMaxBytesMb`: default PDF size limit for the `pdf` tool when `maxBytesMb` is not passed at call time. +- `pdfMaxPages`: default maximum pages considered by extraction fallback mode in the `pdf` tool. - `model.primary`: format `provider/model` (e.g. `anthropic/claude-opus-4-6`). If you omit the provider, OpenClaw assumes `anthropic` (deprecated). - `models`: the configured model catalog and allowlist for `/model`. Each entry can include `alias` (shortcut) and `params` (provider-specific, for example `temperature`, `maxTokens`, `cacheRetention`, `context1m`). - `params` merge precedence (config): `agents.defaults.models["provider/model"].params` is the base, then `agents.list[].params` (matching agent id) overrides by key. 
diff --git a/docs/tools/index.md b/docs/tools/index.md index bc17cb0720f..676671a07f6 100644 --- a/docs/tools/index.md +++ b/docs/tools/index.md @@ -397,6 +397,26 @@ Notes: - Only available when `agents.defaults.imageModel` is configured (primary or fallbacks), or when an implicit image model can be inferred from your default model + configured auth (best-effort pairing). - Uses the image model directly (independent of the main chat model). +### `pdf` + +Analyze one or more PDF documents. + +Core parameters: + +- `pdf` (single path or URL) +- `pdfs` (multiple paths or URLs, up to 10) +- `prompt` (optional, defaults to "Analyze this PDF document.") +- `pages` (optional page range like `1-5` or `1,3,7-9`) +- `model` (optional model override) +- `maxBytesMb` (optional size cap in MB; defaults to `agents.defaults.pdfMaxBytesMb`) + +Notes: + +- Native PDF provider mode is supported for Anthropic and Google models. +- Non-native models use the PDF extraction fallback: text first, then rasterized page images when needed. +- `pages` filtering is only supported in extraction fallback mode. Native providers return a clear error when `pages` is set. +- Defaults are configurable via `agents.defaults.pdfModel`, `agents.defaults.pdfMaxBytesMb`, and `agents.defaults.pdfMaxPages`. + ### `message` Send messages and channel actions across Discord/Google Chat/Slack/Telegram/WhatsApp/Signal/iMessage/MS Teams. 
diff --git a/src/agents/model-catalog.ts b/src/agents/model-catalog.ts index ccae3baa18a..a910a10a9f1 100644 --- a/src/agents/model-catalog.ts +++ b/src/agents/model-catalog.ts @@ -5,13 +5,15 @@ import { ensureOpenClawModelsJson } from "./models-config.js"; const log = createSubsystemLogger("model-catalog"); +export type ModelInputType = "text" | "image" | "document"; + export type ModelCatalogEntry = { id: string; name: string; provider: string; contextWindow?: number; reasoning?: boolean; - input?: Array<"text" | "image">; + input?: ModelInputType[]; }; type DiscoveredModel = { @@ -20,7 +22,7 @@ type DiscoveredModel = { provider: string; contextWindow?: number; reasoning?: boolean; - input?: Array<"text" | "image">; + input?: ModelInputType[]; }; type PiSdkModule = typeof import("./pi-model-discovery.js"); @@ -60,12 +62,12 @@ function applyOpenAICodexSparkFallback(models: ModelCatalogEntry[]): void { }); } -function normalizeConfiguredModelInput(input: unknown): Array<"text" | "image"> | undefined { +function normalizeConfiguredModelInput(input: unknown): ModelInputType[] | undefined { if (!Array.isArray(input)) { return undefined; } const normalized = input.filter( - (item): item is "text" | "image" => item === "text" || item === "image", + (item): item is ModelInputType => item === "text" || item === "image" || item === "document", ); return normalized.length > 0 ? normalized : undefined; } @@ -248,6 +250,13 @@ export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boole return entry?.input?.includes("image") ?? false; } +/** + * Check if a model supports native document/PDF input based on its catalog entry. + */ +export function modelSupportsDocument(entry: ModelCatalogEntry | undefined): boolean { + return entry?.input?.includes("document") ?? false; +} + /** * Find a model in the catalog by provider and model ID. 
*/ diff --git a/src/agents/openclaw-tools.pdf-registration.test.ts b/src/agents/openclaw-tools.pdf-registration.test.ts new file mode 100644 index 00000000000..0816c59b8ae --- /dev/null +++ b/src/agents/openclaw-tools.pdf-registration.test.ts @@ -0,0 +1,33 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { describe, expect, it } from "vitest"; +import type { OpenClawConfig } from "../config/config.js"; +import "./test-helpers/fast-core-tools.js"; +import { createOpenClawTools } from "./openclaw-tools.js"; + +async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promise<T> { + const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-tools-pdf-")); + try { + return await run(agentDir); + } finally { + await fs.rm(agentDir, { recursive: true, force: true }); + } +} + +describe("createOpenClawTools PDF registration", () => { + it("includes pdf tool when pdfModel is configured", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg: OpenClawConfig = { + agents: { + defaults: { + pdfModel: { primary: "openai/gpt-5-mini" }, + }, + }, + }; + + const tools = createOpenClawTools({ config: cfg, agentDir }); + expect(tools.some((tool) => tool.name === "pdf")).toBe(true); + }); + }); +}); diff --git a/src/agents/openclaw-tools.ts b/src/agents/openclaw-tools.ts index 9626d68d1af..f0f91a27148 100644 --- a/src/agents/openclaw-tools.ts +++ b/src/agents/openclaw-tools.ts @@ -13,6 +13,7 @@ import { createGatewayTool } from "./tools/gateway-tool.js"; import { createImageTool } from "./tools/image-tool.js"; import { createMessageTool } from "./tools/message-tool.js"; import { createNodesTool } from "./tools/nodes-tool.js"; +import { createPdfTool } from "./tools/pdf-tool.js"; import { createSessionStatusTool } from "./tools/session-status-tool.js"; import { createSessionsHistoryTool } from "./tools/sessions-history-tool.js"; import { createSessionsListTool } from 
"./tools/sessions-list-tool.js"; @@ -84,6 +85,18 @@ export function createOpenClawTools(options?: { modelHasVision: options?.modelHasVision, }) : null; + const pdfTool = options?.agentDir?.trim() + ? createPdfTool({ + config: options?.config, + agentDir: options.agentDir, + workspaceDir, + sandbox: + options?.sandboxRoot && options?.sandboxFsBridge + ? { root: options.sandboxRoot, bridge: options.sandboxFsBridge } + : undefined, + fsPolicy: options?.fsPolicy, + }) + : null; const webSearchTool = createWebSearchTool({ config: options?.config, sandboxed: options?.sandboxed, @@ -173,6 +186,7 @@ export function createOpenClawTools(options?: { ...(webSearchTool ? [webSearchTool] : []), ...(webFetchTool ? [webFetchTool] : []), ...(imageTool ? [imageTool] : []), + ...(pdfTool ? [pdfTool] : []), ]; const pluginTools = resolvePluginTools({ diff --git a/src/agents/tools/pdf-native-providers.ts b/src/agents/tools/pdf-native-providers.ts new file mode 100644 index 00000000000..36d43ffb9f7 --- /dev/null +++ b/src/agents/tools/pdf-native-providers.ts @@ -0,0 +1,179 @@ +/** + * Direct SDK/HTTP calls for providers that support native PDF document input. + * This bypasses pi-ai's content type system which does not have a "document" type. 
+ */ + +import { isRecord } from "../../utils.js"; +import { normalizeSecretInput } from "../../utils/normalize-secret-input.js"; + +type PdfInput = { + base64: string; + filename?: string; +}; + +// --------------------------------------------------------------------------- +// Anthropic – native PDF via Messages API +// --------------------------------------------------------------------------- + +type AnthropicDocBlock = { + type: "document"; + source: { + type: "base64"; + media_type: "application/pdf"; + data: string; + }; +}; + +type AnthropicTextBlock = { + type: "text"; + text: string; +}; + +type AnthropicContentBlock = AnthropicDocBlock | AnthropicTextBlock; + +type AnthropicResponseContent = Array<{ type: string; text?: string }>; + +export async function anthropicAnalyzePdf(params: { + apiKey: string; + modelId: string; + prompt: string; + pdfs: PdfInput[]; + maxTokens?: number; + baseUrl?: string; +}): Promise<string> { + const apiKey = normalizeSecretInput(params.apiKey); + if (!apiKey) { + throw new Error("Anthropic PDF: apiKey required"); + } + + const content: AnthropicContentBlock[] = []; + for (const pdf of params.pdfs) { + content.push({ + type: "document", + source: { + type: "base64", + media_type: "application/pdf", + data: pdf.base64, + }, + }); + } + content.push({ type: "text", text: params.prompt }); + + const baseUrl = (params.baseUrl ?? "https://api.anthropic.com").replace(/\/+$/, ""); + const res = await fetch(`${baseUrl}/v1/messages`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "x-api-key": apiKey, + "anthropic-version": "2023-06-01", + "anthropic-beta": "pdfs-2024-09-25", + }, + body: JSON.stringify({ + model: params.modelId, + max_tokens: params.maxTokens ?? 4096, + messages: [{ role: "user", content }], + }), + }); + + if (!res.ok) { + const body = await res.text().catch(() => ""); + throw new Error( + `Anthropic PDF request failed (${res.status} ${res.statusText})${body ? 
`: ${body.slice(0, 400)}` : ""}`, + ); + } + + const json = (await res.json().catch(() => null)) as unknown; + if (!isRecord(json)) { + throw new Error("Anthropic PDF response was not JSON."); + } + + const responseContent = json.content as AnthropicResponseContent | undefined; + if (!Array.isArray(responseContent)) { + throw new Error("Anthropic PDF response missing content array."); + } + + const text = responseContent + .filter((block) => block.type === "text" && typeof block.text === "string") + .map((block) => block.text!) + .join(""); + + if (!text.trim()) { + throw new Error("Anthropic PDF returned no text."); + } + + return text.trim(); +} + +// --------------------------------------------------------------------------- +// Google Gemini – native PDF via generateContent API +// --------------------------------------------------------------------------- + +type GeminiPart = { inline_data: { mime_type: string; data: string } } | { text: string }; + +type GeminiCandidate = { + content?: { parts?: Array<{ text?: string }> }; +}; + +export async function geminiAnalyzePdf(params: { + apiKey: string; + modelId: string; + prompt: string; + pdfs: PdfInput[]; + baseUrl?: string; +}): Promise<string> { + const apiKey = normalizeSecretInput(params.apiKey); + if (!apiKey) { + throw new Error("Gemini PDF: apiKey required"); + } + + const parts: GeminiPart[] = []; + for (const pdf of params.pdfs) { + parts.push({ + inline_data: { + mime_type: "application/pdf", + data: pdf.base64, + }, + }); + } + parts.push({ text: params.prompt }); + + const baseUrl = (params.baseUrl ?? 
"https://generativelanguage.googleapis.com").replace( + /\/+$/, + "", + ); + const url = `${baseUrl}/v1beta/models/${encodeURIComponent(params.modelId)}:generateContent?key=${encodeURIComponent(apiKey)}`; + + const res = await fetch(url, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + contents: [{ role: "user", parts }], + }), + }); + + if (!res.ok) { + const body = await res.text().catch(() => ""); + throw new Error( + `Gemini PDF request failed (${res.status} ${res.statusText})${body ? `: ${body.slice(0, 400)}` : ""}`, + ); + } + + const json = (await res.json().catch(() => null)) as unknown; + if (!isRecord(json)) { + throw new Error("Gemini PDF response was not JSON."); + } + + const candidates = json.candidates as GeminiCandidate[] | undefined; + if (!Array.isArray(candidates) || candidates.length === 0) { + throw new Error("Gemini PDF returned no candidates."); + } + + const textParts = candidates[0].content?.parts?.filter((p) => typeof p.text === "string") ?? []; + const text = textParts.map((p) => p.text!).join(""); + + if (!text.trim()) { + throw new Error("Gemini PDF returned no text."); + } + + return text.trim(); +} diff --git a/src/agents/tools/pdf-tool.helpers.ts b/src/agents/tools/pdf-tool.helpers.ts new file mode 100644 index 00000000000..4cb5fde9382 --- /dev/null +++ b/src/agents/tools/pdf-tool.helpers.ts @@ -0,0 +1,103 @@ +import type { AssistantMessage } from "@mariozechner/pi-ai"; +import type { OpenClawConfig } from "../../config/config.js"; +import { + resolveAgentModelFallbackValues, + resolveAgentModelPrimaryValue, +} from "../../config/model-input.js"; +import { extractAssistantText } from "../pi-embedded-utils.js"; + +export type PdfModelConfig = { primary?: string; fallbacks?: string[] }; + +/** + * Providers known to support native PDF document input. 
+ * When the model's provider is in this set, the tool sends raw PDF bytes + * via provider-specific API calls instead of extracting text/images first. + */ +export const NATIVE_PDF_PROVIDERS = new Set(["anthropic", "google"]); + +/** + * Check whether a provider supports native PDF document input. + */ +export function providerSupportsNativePdf(provider: string): boolean { + return NATIVE_PDF_PROVIDERS.has(provider.toLowerCase().trim()); +} + +/** + * Parse a page range string (e.g. "1-5", "3", "1-3,7-9") into a sorted, de-duplicated array of 1-based page numbers; entries must be plain decimal integers, and pages above maxPages are dropped. + */ +export function parsePageRange(range: string, maxPages: number): number[] { + const pages = new Set<number>(); + const parts = range.split(",").map((p) => p.trim()); + for (const part of parts) { + if (!part) { + continue; + } + const dashMatch = /^(\d+)\s*-\s*(\d+)$/.exec(part); + if (dashMatch) { + const start = Number(dashMatch[1]); + const end = Number(dashMatch[2]); + if (!Number.isFinite(start) || !Number.isFinite(end) || start < 1 || end < start) { + throw new Error(`Invalid page range: "${part}"`); + } + for (let i = start; i <= Math.min(end, maxPages); i++) { + pages.add(i); + } + } else { + const num = Number(part); + if (!/^\d+$/.test(part) || !Number.isFinite(num) || num < 1) { + throw new Error(`Invalid page number: "${part}"`); + } + if (num <= maxPages) { + pages.add(num); + } + } + } + return Array.from(pages).toSorted((a, b) => a - b); +} + +export function coercePdfAssistantText(params: { + message: AssistantMessage; + provider: string; + model: string; +}): string { + const stop = params.message.stopReason; + const errorMessage = params.message.errorMessage?.trim(); + if (stop === "error" || stop === "aborted") { + throw new Error( + errorMessage + ? 
`PDF model failed (${params.provider}/${params.model}): ${errorMessage}` + : `PDF model failed (${params.provider}/${params.model})`, + ); + } + if (errorMessage) { + throw new Error(`PDF model failed (${params.provider}/${params.model}): ${errorMessage}`); + } + const text = extractAssistantText(params.message); + if (text.trim()) { + return text.trim(); + } + throw new Error(`PDF model returned no text (${params.provider}/${params.model}).`); +} + +export function coercePdfModelConfig(cfg?: OpenClawConfig): PdfModelConfig { + const primary = resolveAgentModelPrimaryValue(cfg?.agents?.defaults?.pdfModel); + const fallbacks = resolveAgentModelFallbackValues(cfg?.agents?.defaults?.pdfModel); + return { + ...(primary?.trim() ? { primary: primary.trim() } : {}), + ...(fallbacks.length > 0 ? { fallbacks } : {}), + }; +} + +export function resolvePdfToolMaxTokens( + modelMaxTokens: number | undefined, + requestedMaxTokens = 4096, +) { + if ( + typeof modelMaxTokens !== "number" || + !Number.isFinite(modelMaxTokens) || + modelMaxTokens <= 0 + ) { + return requestedMaxTokens; + } + return Math.min(requestedMaxTokens, modelMaxTokens); +} diff --git a/src/agents/tools/pdf-tool.test.ts b/src/agents/tools/pdf-tool.test.ts new file mode 100644 index 00000000000..6d0b7978762 --- /dev/null +++ b/src/agents/tools/pdf-tool.test.ts @@ -0,0 +1,861 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { OpenClawConfig } from "../../config/config.js"; +import { + coercePdfAssistantText, + coercePdfModelConfig, + parsePageRange, + providerSupportsNativePdf, + resolvePdfToolMaxTokens, +} from "./pdf-tool.helpers.js"; +import { createPdfTool, resolvePdfModelConfigForTool } from "./pdf-tool.js"; + +vi.mock("@mariozechner/pi-ai", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + complete: vi.fn(), + }; +}); + +async 
function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promise<T> { + const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-pdf-")); + try { + return await run(agentDir); + } finally { + await fs.rm(agentDir, { recursive: true, force: true }); + } +} + +// --------------------------------------------------------------------------- +// parsePageRange tests +// --------------------------------------------------------------------------- + +describe("parsePageRange", () => { + it("parses a single page number", () => { + expect(parsePageRange("3", 20)).toEqual([3]); + }); + + it("parses a page range", () => { + expect(parsePageRange("1-5", 20)).toEqual([1, 2, 3, 4, 5]); + }); + + it("parses comma-separated pages and ranges", () => { + expect(parsePageRange("1,3,5-7", 20)).toEqual([1, 3, 5, 6, 7]); + }); + + it("clamps to maxPages", () => { + expect(parsePageRange("1-100", 5)).toEqual([1, 2, 3, 4, 5]); + }); + + it("deduplicates and sorts", () => { + expect(parsePageRange("5,3,1,3,5", 20)).toEqual([1, 3, 5]); + }); + + it("throws on invalid page number", () => { + expect(() => parsePageRange("abc", 20)).toThrow("Invalid page number"); + }); + + it("throws on invalid range (start > end)", () => { + expect(() => parsePageRange("5-3", 20)).toThrow("Invalid page range"); + }); + + it("throws on zero page number", () => { + expect(() => parsePageRange("0", 20)).toThrow("Invalid page number"); + }); + + it("throws on negative page number", () => { + expect(() => parsePageRange("-1", 20)).toThrow("Invalid page number"); + }); + + it("handles empty parts gracefully", () => { + expect(parsePageRange("1,,3", 20)).toEqual([1, 3]); + }); +}); + +// --------------------------------------------------------------------------- +// providerSupportsNativePdf tests +// --------------------------------------------------------------------------- + +describe("providerSupportsNativePdf", () => { + it("returns true for anthropic", () => { + 
expect(providerSupportsNativePdf("anthropic")).toBe(true); + }); + + it("returns true for google", () => { + expect(providerSupportsNativePdf("google")).toBe(true); + }); + + it("returns false for openai", () => { + expect(providerSupportsNativePdf("openai")).toBe(false); + }); + + it("returns false for minimax", () => { + expect(providerSupportsNativePdf("minimax")).toBe(false); + }); + + it("is case-insensitive", () => { + expect(providerSupportsNativePdf("Anthropic")).toBe(true); + expect(providerSupportsNativePdf("GOOGLE")).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// PDF model config resolution +// --------------------------------------------------------------------------- + +describe("resolvePdfModelConfigForTool", () => { + const priorFetch = global.fetch; + + beforeEach(() => { + vi.stubEnv("OPENAI_API_KEY", ""); + vi.stubEnv("ANTHROPIC_API_KEY", ""); + vi.stubEnv("ANTHROPIC_OAUTH_TOKEN", ""); + vi.stubEnv("GOOGLE_API_KEY", ""); + vi.stubEnv("COPILOT_GITHUB_TOKEN", ""); + vi.stubEnv("GH_TOKEN", ""); + vi.stubEnv("GITHUB_TOKEN", ""); + }); + + afterEach(() => { + vi.unstubAllEnvs(); + global.fetch = priorFetch; + }); + + it("returns null without any auth", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "openai/gpt-5.2" } } }, + }; + expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toBeNull(); + }); + }); + + it("prefers explicit pdfModel config", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg: OpenClawConfig = { + agents: { + defaults: { + model: { primary: "openai/gpt-5.2" }, + pdfModel: { primary: "anthropic/claude-opus-4-6" }, + }, + }, + } as OpenClawConfig; + expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: "anthropic/claude-opus-4-6", + }); + }); + }); + + it("falls back to imageModel config when no pdfModel set", async () => { + await 
withTempAgentDir(async (agentDir) => { + const cfg: OpenClawConfig = { + agents: { + defaults: { + model: { primary: "openai/gpt-5.2" }, + imageModel: { primary: "openai/gpt-5-mini" }, + }, + }, + }; + expect(resolvePdfModelConfigForTool({ cfg, agentDir })).toEqual({ + primary: "openai/gpt-5-mini", + }); + }); + }); + + it("prefers anthropic when available for native PDF support", async () => { + await withTempAgentDir(async (agentDir) => { + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + vi.stubEnv("OPENAI_API_KEY", "openai-test"); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "openai/gpt-5.2" } } }, + }; + const config = resolvePdfModelConfigForTool({ cfg, agentDir }); + expect(config).not.toBeNull(); + // Should prefer anthropic for native PDF + expect(config?.primary).toBe("anthropic/claude-opus-4-6"); + }); + }); + + it("uses anthropic primary when provider is anthropic", async () => { + await withTempAgentDir(async (agentDir) => { + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "anthropic/claude-opus-4-6" } } }, + }; + const config = resolvePdfModelConfigForTool({ cfg, agentDir }); + expect(config?.primary).toBe("anthropic/claude-opus-4-6"); + }); + }); +}); + +// --------------------------------------------------------------------------- +// createPdfTool +// --------------------------------------------------------------------------- + +describe("createPdfTool", () => { + const priorFetch = global.fetch; + + beforeEach(() => { + vi.stubEnv("OPENAI_API_KEY", ""); + vi.stubEnv("ANTHROPIC_API_KEY", ""); + vi.stubEnv("ANTHROPIC_OAUTH_TOKEN", ""); + vi.stubEnv("GOOGLE_API_KEY", ""); + vi.stubEnv("COPILOT_GITHUB_TOKEN", ""); + vi.stubEnv("GH_TOKEN", ""); + vi.stubEnv("GITHUB_TOKEN", ""); + }); + + afterEach(() => { + vi.restoreAllMocks(); + vi.unstubAllEnvs(); + global.fetch = priorFetch; + }); + + it("returns null without agentDir and no explicit 
config", () => { + expect(createPdfTool()).toBeNull(); + }); + + it("returns null without any auth configured", async () => { + await withTempAgentDir(async (agentDir) => { + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "openai/gpt-5.2" } } }, + }; + expect(createPdfTool({ config: cfg, agentDir })).toBeNull(); + }); + }); + + it("throws when agentDir missing but explicit config present", () => { + const cfg: OpenClawConfig = { + agents: { + defaults: { + pdfModel: { primary: "anthropic/claude-opus-4-6" }, + }, + }, + } as OpenClawConfig; + expect(() => createPdfTool({ config: cfg })).toThrow("requires agentDir"); + }); + + it("creates tool when auth is available", async () => { + await withTempAgentDir(async (agentDir) => { + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "anthropic/claude-opus-4-6" } } }, + }; + const tool = createPdfTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + expect(tool?.name).toBe("pdf"); + expect(tool?.label).toBe("PDF"); + expect(tool?.description).toContain("PDF documents"); + }); + }); + + it("rejects when no pdf input provided", async () => { + await withTempAgentDir(async (agentDir) => { + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "anthropic/claude-opus-4-6" } } }, + }; + const tool = createPdfTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + await expect(tool!.execute("t1", { prompt: "test" })).rejects.toThrow("pdf required"); + }); + }); + + it("rejects too many PDFs", async () => { + await withTempAgentDir(async (agentDir) => { + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "anthropic/claude-opus-4-6" } } }, + }; + const tool = createPdfTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + const manyPdfs = Array.from({ 
length: 15 }, (_, i) => `/tmp/doc${i}.pdf`); + const result = await tool!.execute("t1", { prompt: "test", pdfs: manyPdfs }); + expect(result).toMatchObject({ + details: { error: "too_many_pdfs" }, + }); + }); + }); + + it("rejects unsupported scheme references", async () => { + await withTempAgentDir(async (agentDir) => { + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "anthropic/claude-opus-4-6" } } }, + }; + const tool = createPdfTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + const result = await tool!.execute("t1", { + prompt: "test", + pdf: "ftp://example.com/doc.pdf", + }); + expect(result).toMatchObject({ + details: { error: "unsupported_pdf_reference" }, + }); + }); + }); + + it("deduplicates pdf inputs before loading", async () => { + await withTempAgentDir(async (agentDir) => { + const webMedia = await import("../../web/media.js"); + const loadSpy = vi.spyOn(webMedia, "loadWebMediaRaw").mockResolvedValue({ + kind: "document", + buffer: Buffer.from("%PDF-1.4 fake"), + contentType: "application/pdf", + fileName: "doc.pdf", + } as never); + + const modelDiscovery = await import("../pi-model-discovery.js"); + vi.spyOn(modelDiscovery, "discoverAuthStorage").mockReturnValue({ + setRuntimeApiKey: vi.fn(), + } as never); + vi.spyOn(modelDiscovery, "discoverModels").mockReturnValue({ find: () => null } as never); + + const modelsConfig = await import("../models-config.js"); + vi.spyOn(modelsConfig, "ensureOpenClawModelsJson").mockResolvedValue(undefined); + + const modelAuth = await import("../model-auth.js"); + vi.spyOn(modelAuth, "getApiKeyForModel").mockResolvedValue({ apiKey: "test-key" } as never); + vi.spyOn(modelAuth, "requireApiKey").mockReturnValue("test-key"); + + const cfg: OpenClawConfig = { + agents: { + defaults: { + pdfModel: { primary: "anthropic/claude-opus-4-6" }, + }, + }, + }; + const tool = createPdfTool({ config: cfg, agentDir }); + 
expect(tool).not.toBeNull(); + + await expect( + tool!.execute("t1", { + prompt: "test", + pdf: "/tmp/nonexistent.pdf", + pdfs: ["/tmp/nonexistent.pdf"], + }), + ).rejects.toThrow("Unknown model"); + + expect(loadSpy).toHaveBeenCalledTimes(1); + }); + }); + + it("uses native PDF path without eager extraction", async () => { + await withTempAgentDir(async (agentDir) => { + const webMedia = await import("../../web/media.js"); + vi.spyOn(webMedia, "loadWebMediaRaw").mockResolvedValue({ + kind: "document", + buffer: Buffer.from("%PDF-1.4 fake"), + contentType: "application/pdf", + fileName: "doc.pdf", + } as never); + + const modelDiscovery = await import("../pi-model-discovery.js"); + vi.spyOn(modelDiscovery, "discoverAuthStorage").mockReturnValue({ + setRuntimeApiKey: vi.fn(), + } as never); + vi.spyOn(modelDiscovery, "discoverModels").mockReturnValue({ + find: () => + ({ + provider: "anthropic", + maxTokens: 8192, + input: ["text", "document"], + }) as never, + } as never); + + const modelsConfig = await import("../models-config.js"); + vi.spyOn(modelsConfig, "ensureOpenClawModelsJson").mockResolvedValue(undefined); + + const modelAuth = await import("../model-auth.js"); + vi.spyOn(modelAuth, "getApiKeyForModel").mockResolvedValue({ apiKey: "test-key" } as never); + vi.spyOn(modelAuth, "requireApiKey").mockReturnValue("test-key"); + + const nativeProviders = await import("./pdf-native-providers.js"); + vi.spyOn(nativeProviders, "anthropicAnalyzePdf").mockResolvedValue("native summary"); + + const extractModule = await import("../../media/pdf-extract.js"); + const extractSpy = vi.spyOn(extractModule, "extractPdfContent"); + + const cfg: OpenClawConfig = { + agents: { + defaults: { + pdfModel: { primary: "anthropic/claude-opus-4-6" }, + }, + }, + }; + + const tool = createPdfTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + + const result = await tool!.execute("t1", { + prompt: "summarize", + pdf: "/tmp/doc.pdf", + }); + + 
expect(extractSpy).not.toHaveBeenCalled(); + expect(result).toMatchObject({ + content: [{ type: "text", text: "native summary" }], + details: { native: true, model: "anthropic/claude-opus-4-6" }, + }); + }); + }); + + it("rejects pages parameter for native PDF providers", async () => { + await withTempAgentDir(async (agentDir) => { + const webMedia = await import("../../web/media.js"); + vi.spyOn(webMedia, "loadWebMediaRaw").mockResolvedValue({ + kind: "document", + buffer: Buffer.from("%PDF-1.4 fake"), + contentType: "application/pdf", + fileName: "doc.pdf", + } as never); + + const modelDiscovery = await import("../pi-model-discovery.js"); + vi.spyOn(modelDiscovery, "discoverAuthStorage").mockReturnValue({ + setRuntimeApiKey: vi.fn(), + } as never); + vi.spyOn(modelDiscovery, "discoverModels").mockReturnValue({ + find: () => + ({ + provider: "anthropic", + maxTokens: 8192, + input: ["text", "document"], + }) as never, + } as never); + + const modelsConfig = await import("../models-config.js"); + vi.spyOn(modelsConfig, "ensureOpenClawModelsJson").mockResolvedValue(undefined); + + const modelAuth = await import("../model-auth.js"); + vi.spyOn(modelAuth, "getApiKeyForModel").mockResolvedValue({ apiKey: "test-key" } as never); + vi.spyOn(modelAuth, "requireApiKey").mockReturnValue("test-key"); + + const cfg: OpenClawConfig = { + agents: { + defaults: { + pdfModel: { primary: "anthropic/claude-opus-4-6" }, + }, + }, + }; + + const tool = createPdfTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + + await expect( + tool!.execute("t1", { + prompt: "summarize", + pdf: "/tmp/doc.pdf", + pages: "1-2", + }), + ).rejects.toThrow("pages is not supported with native PDF providers"); + }); + }); + + it("uses extraction fallback for non-native models", async () => { + await withTempAgentDir(async (agentDir) => { + const webMedia = await import("../../web/media.js"); + vi.spyOn(webMedia, "loadWebMediaRaw").mockResolvedValue({ + kind: "document", + buffer: 
Buffer.from("%PDF-1.4 fake"), + contentType: "application/pdf", + fileName: "doc.pdf", + } as never); + + const modelDiscovery = await import("../pi-model-discovery.js"); + vi.spyOn(modelDiscovery, "discoverAuthStorage").mockReturnValue({ + setRuntimeApiKey: vi.fn(), + } as never); + vi.spyOn(modelDiscovery, "discoverModels").mockReturnValue({ + find: () => + ({ + provider: "openai", + maxTokens: 8192, + input: ["text"], + }) as never, + } as never); + + const modelsConfig = await import("../models-config.js"); + vi.spyOn(modelsConfig, "ensureOpenClawModelsJson").mockResolvedValue(undefined); + + const modelAuth = await import("../model-auth.js"); + vi.spyOn(modelAuth, "getApiKeyForModel").mockResolvedValue({ apiKey: "test-key" } as never); + vi.spyOn(modelAuth, "requireApiKey").mockReturnValue("test-key"); + + const extractModule = await import("../../media/pdf-extract.js"); + const extractSpy = vi.spyOn(extractModule, "extractPdfContent").mockResolvedValue({ + text: "Extracted content", + images: [], + }); + + const piAi = await import("@mariozechner/pi-ai"); + vi.mocked(piAi.complete).mockResolvedValue({ + role: "assistant", + stopReason: "stop", + content: [{ type: "text", text: "fallback summary" }], + } as never); + + const cfg: OpenClawConfig = { + agents: { + defaults: { + pdfModel: { primary: "openai/gpt-5-mini" }, + }, + }, + }; + + const tool = createPdfTool({ config: cfg, agentDir }); + expect(tool).not.toBeNull(); + + const result = await tool!.execute("t1", { + prompt: "summarize", + pdf: "/tmp/doc.pdf", + }); + + expect(extractSpy).toHaveBeenCalledTimes(1); + expect(result).toMatchObject({ + content: [{ type: "text", text: "fallback summary" }], + details: { native: false, model: "openai/gpt-5-mini" }, + }); + }); + }); + + it("tool parameters have correct schema shape", async () => { + await withTempAgentDir(async (agentDir) => { + vi.stubEnv("ANTHROPIC_API_KEY", "anthropic-test"); + const cfg: OpenClawConfig = { + agents: { defaults: { model: { 
describe("native PDF provider API calls", () => {
  const originalFetch = global.fetch;

  /** Minimal single-document payload shared by most cases. */
  const singlePdf = () => [{ base64: "dGVzdA==", filename: "doc.pdf" }];

  /** Lazily load the module under test, exactly as each case did inline. */
  const loadProviders = () => import("./pdf-native-providers.js");

  /** Install a fetch stub that always resolves to `response` and hand it back for inspection. */
  const stubFetch = (response: Record<string, unknown>) => {
    const stub = vi.fn().mockResolvedValue(response);
    global.fetch = stub;
    return stub;
  };

  /** Successful Anthropic-style response carrying a single text block. */
  const anthropicOk = (text: string) => ({
    ok: true,
    json: async () => ({
      content: [{ type: "text", text }],
    }),
  });

  afterEach(() => {
    // Undo whatever stub the case installed so later suites see the real fetch.
    global.fetch = originalFetch;
  });

  it("anthropicAnalyzePdf sends correct request shape", async () => {
    const { anthropicAnalyzePdf } = await loadProviders();
    const fetchStub = stubFetch(anthropicOk("Analysis of PDF"));

    const result = await anthropicAnalyzePdf({
      apiKey: "test-key",
      modelId: "claude-opus-4-6",
      prompt: "Summarize this document",
      pdfs: singlePdf(),
      maxTokens: 4096,
    });

    expect(result).toBe("Analysis of PDF");
    expect(fetchStub).toHaveBeenCalledTimes(1);

    const [url, opts] = fetchStub.mock.calls[0];
    expect(url).toContain("/v1/messages");

    const body = JSON.parse(opts.body);
    const blocks = body.messages[0].content;
    expect(body.model).toBe("claude-opus-4-6");
    expect(blocks).toHaveLength(2);
    expect(blocks[0].type).toBe("document");
    expect(blocks[0].source.media_type).toBe("application/pdf");
    expect(blocks[1].type).toBe("text");
  });

  it("anthropicAnalyzePdf throws on API error", async () => {
    const { anthropicAnalyzePdf } = await loadProviders();
    stubFetch({
      ok: false,
      status: 400,
      statusText: "Bad Request",
      text: async () => "invalid request",
    });

    const pending = anthropicAnalyzePdf({
      apiKey: "test-key",
      modelId: "claude-opus-4-6",
      prompt: "test",
      pdfs: singlePdf(),
    });

    await expect(pending).rejects.toThrow("Anthropic PDF request failed");
  });

  it("anthropicAnalyzePdf throws when response has no text", async () => {
    const { anthropicAnalyzePdf } = await loadProviders();
    // Whitespace-only text must be treated the same as no text at all.
    stubFetch(anthropicOk(" "));

    const pending = anthropicAnalyzePdf({
      apiKey: "test-key",
      modelId: "claude-opus-4-6",
      prompt: "test",
      pdfs: singlePdf(),
    });

    await expect(pending).rejects.toThrow("Anthropic PDF returned no text");
  });

  it("geminiAnalyzePdf sends correct request shape", async () => {
    const { geminiAnalyzePdf } = await loadProviders();
    const fetchStub = stubFetch({
      ok: true,
      json: async () => ({
        candidates: [
          {
            content: { parts: [{ text: "Gemini PDF analysis" }] },
          },
        ],
      }),
    });

    const result = await geminiAnalyzePdf({
      apiKey: "test-key",
      modelId: "gemini-2.5-pro",
      prompt: "Summarize this",
      pdfs: singlePdf(),
    });

    expect(result).toBe("Gemini PDF analysis");
    expect(fetchStub).toHaveBeenCalledTimes(1);

    const [url, opts] = fetchStub.mock.calls[0];
    expect(url).toContain("generateContent");
    expect(url).toContain("gemini-2.5-pro");

    const parts = JSON.parse(opts.body).contents[0].parts;
    expect(parts).toHaveLength(2);
    expect(parts[0].inline_data.mime_type).toBe("application/pdf");
    expect(parts[1].text).toBe("Summarize this");
  });

  it("geminiAnalyzePdf throws on API error", async () => {
    const { geminiAnalyzePdf } = await loadProviders();
    stubFetch({
      ok: false,
      status: 500,
      statusText: "Internal Server Error",
      text: async () => "server error",
    });

    const pending = geminiAnalyzePdf({
      apiKey: "test-key",
      modelId: "gemini-2.5-pro",
      prompt: "test",
      pdfs: singlePdf(),
    });

    await expect(pending).rejects.toThrow("Gemini PDF request failed");
  });

  it("geminiAnalyzePdf throws when no candidates returned", async () => {
    const { geminiAnalyzePdf } = await loadProviders();
    stubFetch({
      ok: true,
      json: async () => ({ candidates: [] }),
    });

    const pending = geminiAnalyzePdf({
      apiKey: "test-key",
      modelId: "gemini-2.5-pro",
      prompt: "test",
      pdfs: singlePdf(),
    });

    await expect(pending).rejects.toThrow("Gemini PDF returned no candidates");
  });

  it("anthropicAnalyzePdf supports multiple PDFs", async () => {
    const { anthropicAnalyzePdf } = await loadProviders();
    const fetchStub = stubFetch(anthropicOk("Multi-doc analysis"));

    await anthropicAnalyzePdf({
      apiKey: "test-key",
      modelId: "claude-opus-4-6",
      prompt: "Compare these documents",
      pdfs: [
        { base64: "cGRmMQ==", filename: "doc1.pdf" },
        { base64: "cGRmMg==", filename: "doc2.pdf" },
      ],
    });

    // Expect one document block per PDF followed by the prompt text block.
    const blocks = JSON.parse(fetchStub.mock.calls[0][1].body).messages[0].content;
    expect(blocks).toHaveLength(3);
    expect(blocks[0].type).toBe("document");
    expect(blocks[1].type).toBe("document");
    expect(blocks[2].type).toBe("text");
  });

  it("anthropicAnalyzePdf uses custom base URL", async () => {
    const { anthropicAnalyzePdf } = await loadProviders();
    const fetchStub = stubFetch(anthropicOk("ok"));

    await anthropicAnalyzePdf({
      apiKey: "test-key",
      modelId: "claude-opus-4-6",
      prompt: "test",
      pdfs: singlePdf(),
      baseUrl: "https://custom.example.com",
    });

    const requestedUrl = fetchStub.mock.calls[0][0];
    expect(requestedUrl).toContain("https://custom.example.com/v1/messages");
  });

  it("anthropicAnalyzePdf requires apiKey", async () => {
    const { anthropicAnalyzePdf } = await loadProviders();

    const pending = anthropicAnalyzePdf({
      apiKey: "",
      modelId: "claude-opus-4-6",
      prompt: "test",
      pdfs: singlePdf(),
    });

    await expect(pending).rejects.toThrow("apiKey required");
  });

  it("geminiAnalyzePdf requires apiKey", async () => {
    const { geminiAnalyzePdf } = await loadProviders();

    const pending = geminiAnalyzePdf({
      apiKey: "",
      modelId: "gemini-2.5-pro",
      prompt: "test",
      pdfs: singlePdf(),
    });

    await expect(pending).rejects.toThrow("apiKey required");
  });
});
describe("model catalog document support", () => {
  /** Pull the predicate under test from the catalog module on demand. */
  const loadPredicate = async () => (await import("../model-catalog.js")).modelSupportsDocument;

  /** Build a throwaway catalog entry that differs only in its accepted input kinds. */
  const entryWithInput = (input: string[]) => ({
    id: "test",
    name: "test",
    provider: "test",
    input,
  });

  it("modelSupportsDocument returns true when input includes document", async () => {
    const modelSupportsDocument = await loadPredicate();
    const verdict = modelSupportsDocument(entryWithInput(["text", "document"]));
    expect(verdict).toBe(true);
  });

  it("modelSupportsDocument returns false when input lacks document", async () => {
    const modelSupportsDocument = await loadPredicate();
    const verdict = modelSupportsDocument(entryWithInput(["text", "image"]));
    expect(verdict).toBe(false);
  });

  it("modelSupportsDocument returns false for undefined entry", async () => {
    const modelSupportsDocument = await loadPredicate();
    expect(modelSupportsDocument(undefined)).toBe(false);
  });
});
import { type Api, type Context, complete, type Model } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import { extractPdfContent, type PdfExtractedContent } from "../../media/pdf-extract.js";
import { resolveUserPath } from "../../utils.js";
import { getDefaultLocalRoots, loadWebMediaRaw } from "../../web/media.js";
import { ensureAuthProfileStore, listProfilesForProvider } from "../auth-profiles.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
import { getApiKeyForModel, requireApiKey, resolveEnvApiKey } from "../model-auth.js";
import { runWithImageModelFallback } from "../model-fallback.js";
import { resolveConfiguredModelRef } from "../model-selection.js";
import { ensureOpenClawModelsJson } from "../models-config.js";
import { discoverAuthStorage, discoverModels } from "../pi-model-discovery.js";
import {
  createSandboxBridgeReadFile,
  resolveSandboxedBridgeMediaPath,
  type SandboxedBridgeMediaPathConfig,
} from "../sandbox-media-paths.js";
import type { SandboxFsBridge } from "../sandbox/fs-bridge.js";
import type { ToolFsPolicy } from "../tool-fs-policy.js";
import { normalizeWorkspaceDir } from "../workspace-dir.js";
import type { AnyAgentTool } from "./common.js";
import {
  coerceImageModelConfig,
  type ImageModelConfig,
  resolveProviderVisionModelFromConfig,
} from "./image-tool.helpers.js";
import { anthropicAnalyzePdf, geminiAnalyzePdf } from "./pdf-native-providers.js";
import {
  coercePdfAssistantText,
  coercePdfModelConfig,
  parsePageRange,
  providerSupportsNativePdf,
  resolvePdfToolMaxTokens,
} from "./pdf-tool.helpers.js";

/** Prompt used when the caller supplies none (or only whitespace). */
const DEFAULT_PROMPT = "Analyze this PDF document.";
/** Hard cap on the number of distinct PDFs accepted by a single tool call. */
const DEFAULT_MAX_PDFS = 10;
/** Size limit (MB) used when neither the call nor `agents.defaults.pdfMaxBytesMb` sets one. */
const DEFAULT_MAX_BYTES_MB = 10;
/** Page limit used when `agents.defaults.pdfMaxPages` is not configured. */
const DEFAULT_MAX_PAGES = 20;
const ANTHROPIC_PDF_PRIMARY = "anthropic/claude-opus-4-6";
const ANTHROPIC_PDF_FALLBACK = "anthropic/claude-opus-4-5";

// Forwarded verbatim to extractPdfContent as `minTextChars` / `maxPixels`.
// NOTE(review): presumably the text threshold decides when pages are rasterised — confirm in pdf-extract.
const PDF_MIN_TEXT_CHARS = 200;
const PDF_MAX_PIXELS = 4_000_000;

/** Matches a leading `file://` in any letter case, mirroring the case-insensitive scheme detection. */
const FILE_URL_PREFIX = /^file:\/\//i;

// ---------------------------------------------------------------------------
// Model resolution (mirrors image tool pattern)
// ---------------------------------------------------------------------------

/**
 * Resolve the default provider/model pair.
 *
 * @param cfg - Optional config; when present the configured model reference is resolved against the
 *   built-in defaults, otherwise the built-in defaults are returned directly.
 */
function resolveDefaultModelRef(cfg?: OpenClawConfig): { provider: string; model: string } {
  if (!cfg) {
    return { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL };
  }
  const resolved = resolveConfiguredModelRef({
    cfg,
    defaultProvider: DEFAULT_PROVIDER,
    defaultModel: DEFAULT_MODEL,
  });
  return { provider: resolved.provider, model: resolved.model };
}

/**
 * Report whether credentials exist for `provider`: either an API key resolved from the environment
 * or at least one auth profile in the agent's profile store (looked up without keychain prompts).
 */
function hasAuthForProvider(params: { provider: string; agentDir: string }): boolean {
  if (resolveEnvApiKey(params.provider)?.apiKey) {
    return true;
  }
  const store = ensureAuthProfileStore(params.agentDir, { allowKeychainPrompt: false });
  return listProfilesForProvider(store, params.provider).length > 0;
}

/** True when a model config names a non-blank primary or at least one fallback. */
function hasModelSelection(config: ImageModelConfig): boolean {
  return Boolean(config.primary?.trim()) || (config.fallbacks?.length ?? 0) > 0;
}

/** True for finite numbers strictly greater than zero. */
function isPositiveFinite(value: unknown): value is number {
  return typeof value === "number" && Number.isFinite(value) && value > 0;
}

/**
 * Resolve the effective PDF model config.
 *
 * Resolution order:
 * 1. explicit `pdfModel` config,
 * 2. explicit `imageModel` config,
 * 3. auto-detection from providers that have credentials.
 *
 * @returns The model config to use, or `null` when nothing is configured and no supported provider
 *   has credentials (the tool is then not registered).
 */
export function resolvePdfModelConfigForTool(params: {
  cfg?: OpenClawConfig;
  agentDir: string;
}): ImageModelConfig | null {
  const explicitPdf = coercePdfModelConfig(params.cfg);
  if (hasModelSelection(explicitPdf)) {
    return explicitPdf;
  }

  const explicitImage = coerceImageModelConfig(params.cfg);
  if (hasModelSelection(explicitImage)) {
    return explicitImage;
  }

  // Auto-detect from available providers.
  const primary = resolveDefaultModelRef(params.cfg);
  const anthropicOk = hasAuthForProvider({ provider: "anthropic", agentDir: params.agentDir });
  const googleOk = hasAuthForProvider({ provider: "google", agentDir: params.agentDir });
  const openaiOk = hasAuthForProvider({ provider: "openai", agentDir: params.agentDir });
  const providerOk = hasAuthForProvider({ provider: primary.provider, agentDir: params.agentDir });
  const providerVision = resolveProviderVisionModelFromConfig({
    cfg: params.cfg,
    provider: primary.provider,
  });

  // Prefer the session's own provider, then providers with native PDF support.
  // A dedicated "google" branch is unnecessary: for a google primary, `providerOk` is the same
  // credential check as `googleOk`, so the generic branch yields the identical result.
  let preferred: string | null = null;
  if (primary.provider === "anthropic" && anthropicOk) {
    preferred = ANTHROPIC_PDF_PRIMARY;
  } else if (providerOk && providerVision) {
    preferred = providerVision;
  } else if (anthropicOk) {
    preferred = ANTHROPIC_PDF_PRIMARY;
  } else if (googleOk) {
    preferred = "google/gemini-2.5-pro";
  } else if (openaiOk) {
    preferred = "openai/gpt-5-mini";
  }

  if (!preferred?.trim()) {
    return null;
  }

  // Ordered, de-duplicated fallbacks; the preferred model itself is never listed as a fallback.
  const fallbacks: string[] = [];
  const addFallback = (ref: string) => {
    const trimmed = ref.trim();
    if (trimmed && trimmed !== preferred && !fallbacks.includes(trimmed)) {
      fallbacks.push(trimmed);
    }
  };
  if (anthropicOk) {
    addFallback(ANTHROPIC_PDF_PRIMARY);
    addFallback(ANTHROPIC_PDF_FALLBACK);
  }
  if (openaiOk) {
    addFallback("openai/gpt-5-mini");
  }

  return { primary: preferred, ...(fallbacks.length > 0 ? { fallbacks } : {}) };
}

// ---------------------------------------------------------------------------
// Build context for extraction fallback path
// ---------------------------------------------------------------------------

/**
 * Build the single-user-message context sent to non-native models.
 *
 * For every extraction the non-blank text is added first (labelled, and numbered when there is more
 * than one PDF), followed by that PDF's images; the user prompt is always the final content part.
 */
function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedContent[]): Context {
  const content: Array<
    { type: "text"; text: string } | { type: "image"; data: string; mimeType: string }
  > = [];
  const numbered = extractions.length > 1;

  for (const [index, extraction] of extractions.entries()) {
    if (extraction.text.trim()) {
      const label = numbered ? `[PDF ${index + 1} text]\n` : "[PDF text]\n";
      content.push({ type: "text", text: label + extraction.text });
    }
    for (const img of extraction.images) {
      content.push({ type: "image", data: img.data, mimeType: img.mimeType });
    }
  }

  content.push({ type: "text", text: prompt });

  return {
    messages: [{ role: "user", content, timestamp: Date.now() }],
  };
}

// ---------------------------------------------------------------------------
// Run PDF prompt with model fallback
// ---------------------------------------------------------------------------

type PdfSandboxConfig = {
  root: string;
  bridge: SandboxFsBridge;
};

/**
 * Run the prompt against the configured PDF models, trying fallbacks via
 * `runWithImageModelFallback`.
 *
 * Providers reported by `providerSupportsNativePdf` receive the raw base64 PDFs (Anthropic and
 * Google are wired up here); every other model gets the extracted text/images instead. Extraction is
 * lazy and cached, so it only happens if a non-native attempt is actually made.
 *
 * @throws Inside an attempt: when the model is unknown, when `pageNumbers` is combined with a native
 *   provider, or when a text-only model is given a PDF with images but no extractable text.
 */
async function runPdfPrompt(params: {
  cfg?: OpenClawConfig;
  agentDir: string;
  pdfModelConfig: ImageModelConfig;
  modelOverride?: string;
  prompt: string;
  pdfBuffers: Array<{ base64: string; filename: string }>;
  pageNumbers?: number[];
  getExtractions: () => Promise<PdfExtractedContent[]>;
}): Promise<{
  text: string;
  provider: string;
  model: string;
  native: boolean;
  attempts: Array<{ provider: string; model: string; error: string }>;
}> {
  // The fallback runner reads `imageModel`, so the resolved PDF model config is projected onto it.
  const effectiveCfg: OpenClawConfig | undefined = params.cfg
    ? {
        ...params.cfg,
        agents: {
          ...params.cfg.agents,
          defaults: {
            ...params.cfg.agents?.defaults,
            imageModel: params.pdfModelConfig,
          },
        },
      }
    : undefined;

  await ensureOpenClawModelsJson(effectiveCfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const modelRegistry = discoverModels(authStorage, params.agentDir);

  // Extract at most once, and only when a non-native attempt needs it.
  let extractionCache: PdfExtractedContent[] | null = null;
  const getExtractions = async (): Promise<PdfExtractedContent[]> => {
    if (!extractionCache) {
      extractionCache = await params.getExtractions();
    }
    return extractionCache;
  };

  const result = await runWithImageModelFallback({
    cfg: effectiveCfg,
    modelOverride: params.modelOverride,
    run: async (provider, modelId) => {
      const model = modelRegistry.find(provider, modelId) as Model<Api> | null;
      if (!model) {
        throw new Error(`Unknown model: ${provider}/${modelId}`);
      }

      const apiKeyInfo = await getApiKeyForModel({
        model,
        cfg: effectiveCfg,
        agentDir: params.agentDir,
      });
      const apiKey = requireApiKey(apiKeyInfo, model.provider);
      authStorage.setRuntimeApiKey(model.provider, apiKey);

      if (providerSupportsNativePdf(provider)) {
        // Native providers receive the whole document, so page filtering cannot be honoured.
        if (params.pageNumbers && params.pageNumbers.length > 0) {
          throw new Error(
            `pages is not supported with native PDF providers (${provider}/${modelId}). Remove pages, or use a non-native model for page filtering.`,
          );
        }

        const pdfs = params.pdfBuffers.map((p) => ({
          base64: p.base64,
          filename: p.filename,
        }));

        if (provider === "anthropic") {
          const text = await anthropicAnalyzePdf({
            apiKey,
            modelId,
            prompt: params.prompt,
            pdfs,
            maxTokens: resolvePdfToolMaxTokens(model.maxTokens),
            baseUrl: model.baseUrl,
          });
          return { text, provider, model: modelId, native: true };
        }

        if (provider === "google") {
          const text = await geminiAnalyzePdf({
            apiKey,
            modelId,
            prompt: params.prompt,
            pdfs,
            baseUrl: model.baseUrl,
          });
          return { text, provider, model: modelId, native: true };
        }
        // Any other provider flagged as native has no client here; fall through to extraction.
      }

      const extractions = await getExtractions();
      const hasImages = extractions.some((e) => e.images.length > 0);

      // Text-only models cannot take page images: drop them, but only if some text remains.
      let usable = extractions;
      if (hasImages && !model.input?.includes("image")) {
        const hasText = extractions.some((e) => e.text.trim().length > 0);
        if (!hasText) {
          throw new Error(
            `Model ${provider}/${modelId} does not support images and PDF has no extractable text.`,
          );
        }
        usable = extractions.map((e) => ({ text: e.text, images: [] }));
      }

      const context = buildPdfExtractionContext(params.prompt, usable);
      const message = await complete(model, context, {
        apiKey,
        maxTokens: resolvePdfToolMaxTokens(model.maxTokens),
      });
      const text = coercePdfAssistantText({ message, provider, model: modelId });
      return { text, provider, model: modelId, native: false };
    },
  });

  return {
    text: result.result.text,
    provider: result.result.provider,
    model: result.result.model,
    native: result.result.native,
    attempts: result.attempts.map((a) => ({
      provider: a.provider,
      model: a.model,
      error: a.error,
    })),
  };
}

// ---------------------------------------------------------------------------
// PDF tool factory
// ---------------------------------------------------------------------------

/**
 * Create the `pdf` agent tool.
 *
 * @returns The tool, or `null` when no `agentDir` is given (and no PDF model is explicitly
 *   configured) or when no usable PDF model can be resolved.
 * @throws When a PDF model is explicitly configured but `agentDir` is missing.
 */
export function createPdfTool(options?: {
  config?: OpenClawConfig;
  agentDir?: string;
  workspaceDir?: string;
  sandbox?: PdfSandboxConfig;
  fsPolicy?: ToolFsPolicy;
}): AnyAgentTool | null {
  const agentDir = options?.agentDir?.trim();
  if (!agentDir) {
    if (hasModelSelection(coercePdfModelConfig(options?.config))) {
      throw new Error("createPdfTool requires agentDir when enabled");
    }
    return null;
  }

  const pdfModelConfig = resolvePdfModelConfigForTool({ cfg: options?.config, agentDir });
  if (!pdfModelConfig) {
    return null;
  }

  // Configured limits; anything missing, non-finite or non-positive falls back to the defaults.
  const defaults = options?.config?.agents?.defaults;
  const maxBytesMbDefault = defaults?.pdfMaxBytesMb;
  const maxPagesDefault = defaults?.pdfMaxPages;
  const configuredMaxBytesMb = isPositiveFinite(maxBytesMbDefault)
    ? maxBytesMbDefault
    : DEFAULT_MAX_BYTES_MB;
  const configuredMaxPages =
    isPositiveFinite(maxPagesDefault) && maxPagesDefault >= 1
      ? Math.floor(maxPagesDefault)
      : DEFAULT_MAX_PAGES;

  // Local roots readable outside a sandbox: the defaults plus the workspace dir, de-duplicated.
  const defaultRoots = getDefaultLocalRoots();
  const workspaceDir = normalizeWorkspaceDir(options?.workspaceDir);
  const localRoots = workspaceDir
    ? Array.from(new Set([...defaultRoots, workspaceDir]))
    : defaultRoots;

  const description =
    "Analyze one or more PDF documents with a model. Supports native PDF analysis for Anthropic and Google models, with text/image extraction fallback for other providers. Use pdf for a single path/URL, or pdfs for multiple (up to 10). Provide a prompt describing what to analyze.";

  return {
    label: "PDF",
    name: "pdf",
    description,
    parameters: Type.Object({
      prompt: Type.Optional(Type.String()),
      pdf: Type.Optional(Type.String({ description: "Single PDF path or URL." })),
      pdfs: Type.Optional(
        Type.Array(Type.String(), {
          description: "Multiple PDF paths or URLs (up to 10).",
        }),
      ),
      pages: Type.Optional(
        Type.String({
          description: 'Page range to process, e.g. "1-5", "1,3,5-7". Defaults to all pages.',
        }),
      ),
      model: Type.Optional(Type.String()),
      maxBytesMb: Type.Optional(Type.Number()),
    }),
    execute: async (_toolCallId, args) => {
      const record = args && typeof args === "object" ? (args as Record<string, unknown>) : {};

      // Normalize pdf + pdfs into one trimmed, de-duplicated, order-preserving list.
      const pdfCandidates: string[] = [];
      if (typeof record.pdf === "string") {
        pdfCandidates.push(record.pdf);
      }
      if (Array.isArray(record.pdfs)) {
        pdfCandidates.push(...record.pdfs.filter((v): v is string => typeof v === "string"));
      }

      const seenPdfs = new Set<string>();
      const pdfInputs: string[] = [];
      for (const candidate of pdfCandidates) {
        const trimmed = candidate.trim();
        if (!trimmed || seenPdfs.has(trimmed)) {
          continue;
        }
        seenPdfs.add(trimmed);
        pdfInputs.push(trimmed);
      }
      if (pdfInputs.length === 0) {
        throw new Error("pdf required: provide a path or URL to a PDF document");
      }

      // Too many inputs is reported as a tool result (not thrown) so the model can retry with fewer.
      if (pdfInputs.length > DEFAULT_MAX_PDFS) {
        return {
          content: [
            {
              type: "text",
              text: `Too many PDFs: ${pdfInputs.length} provided, maximum is ${DEFAULT_MAX_PDFS}. Please reduce the number.`,
            },
          ],
          details: { error: "too_many_pdfs", count: pdfInputs.length, max: DEFAULT_MAX_PDFS },
        };
      }

      const promptRaw =
        typeof record.prompt === "string" && record.prompt.trim()
          ? record.prompt.trim()
          : DEFAULT_PROMPT;
      const modelOverride =
        typeof record.model === "string" && record.model.trim() ? record.model.trim() : undefined;
      // A per-call size limit wins over the configured default when it is a positive finite number.
      const maxBytesMb = isPositiveFinite(record.maxBytesMb)
        ? record.maxBytesMb
        : configuredMaxBytesMb;
      const maxBytes = Math.floor(maxBytesMb * 1024 * 1024);

      const pagesRaw =
        typeof record.pages === "string" && record.pages.trim() ? record.pages.trim() : undefined;

      const sandboxRoot = options?.sandbox?.root.trim();
      const sandboxConfig: SandboxedBridgeMediaPathConfig | null =
        options?.sandbox && sandboxRoot
          ? {
              root: sandboxRoot,
              bridge: options.sandbox.bridge,
              workspaceOnly: options.fsPolicy?.workspaceOnly === true,
            }
          : null;

      // Load each PDF.
      const loadedPdfs: Array<{
        base64: string;
        buffer: Buffer;
        filename: string;
        resolvedPath: string;
        rewrittenFrom?: string;
      }> = [];

      for (const pdfRaw of pdfInputs) {
        // Entries of pdfInputs are already trimmed.
        const isHttpUrl = /^https?:\/\//i.test(pdfRaw);
        const isFileUrl = /^file:/i.test(pdfRaw);
        const isDataUrl = /^data:/i.test(pdfRaw);
        const looksLikeWindowsDrive = /^[a-zA-Z]:[\\/]/.test(pdfRaw);
        const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(pdfRaw);

        // "C:\..." looks like a URL scheme but is a local path; every other unknown scheme is refused.
        if (hasScheme && !looksLikeWindowsDrive && !isFileUrl && !isHttpUrl && !isDataUrl) {
          return {
            content: [
              {
                type: "text",
                text: `Unsupported PDF reference: ${pdfRaw}. Use a file path, file:// URL, or http(s) URL.`,
              },
            ],
            details: { error: "unsupported_pdf_reference", pdf: pdfRaw },
          };
        }

        if (sandboxConfig && isHttpUrl) {
          throw new Error("Sandboxed PDF tool does not allow remote URLs.");
        }

        // Home-relative paths are only expanded outside a sandbox; the bridge resolves its own paths.
        const resolvedPdf =
          !sandboxConfig && pdfRaw.startsWith("~") ? resolveUserPath(pdfRaw) : pdfRaw;

        const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = sandboxConfig
          ? await resolveSandboxedBridgeMediaPath({
              sandbox: sandboxConfig,
              mediaPath: resolvedPdf,
              inboundFallbackDir: "media/inbound",
            })
          : // Strip the prefix case-insensitively so "FILE://..." is handled like "file://...".
            { resolved: resolvedPdf.replace(FILE_URL_PREFIX, "") };

        const media = sandboxConfig
          ? await loadWebMediaRaw(resolvedPathInfo.resolved, {
              maxBytes,
              sandboxValidated: true,
              readFile: createSandboxBridgeReadFile({ sandbox: sandboxConfig }),
            })
          : await loadWebMediaRaw(resolvedPathInfo.resolved, {
              maxBytes,
              localRoots,
            });

        // Anything not classified as a document must at least declare a PDF content type.
        if (media.kind !== "document") {
          const contentType = (media.contentType ?? "").toLowerCase();
          if (!contentType.includes("pdf")) {
            throw new Error(`Expected PDF but got ${media.contentType ?? media.kind}: ${pdfRaw}`);
          }
        }

        // `||` rather than `??`: a URL ending in "/" yields an empty last segment, which must also
        // fall back to the generic name instead of producing an empty filename.
        const filename =
          media.fileName ||
          (isHttpUrl ? new URL(pdfRaw).pathname.split("/").pop() : undefined) ||
          "document.pdf";

        loadedPdfs.push({
          base64: media.buffer.toString("base64"),
          buffer: media.buffer,
          filename,
          resolvedPath: resolvedPathInfo.resolved,
          ...(resolvedPathInfo.rewrittenFrom
            ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom }
            : {}),
        });
      }

      const pageNumbers = pagesRaw ? parsePageRange(pagesRaw, configuredMaxPages) : undefined;

      // Deferred extraction: only invoked by runPdfPrompt when a non-native model is attempted.
      const getExtractions = async (): Promise<PdfExtractedContent[]> => {
        const extractedAll: PdfExtractedContent[] = [];
        for (const pdf of loadedPdfs) {
          const extracted = await extractPdfContent({
            buffer: pdf.buffer,
            maxPages: configuredMaxPages,
            maxPixels: PDF_MAX_PIXELS,
            minTextChars: PDF_MIN_TEXT_CHARS,
            pageNumbers,
          });
          extractedAll.push(extracted);
        }
        return extractedAll;
      };

      const result = await runPdfPrompt({
        cfg: options?.config,
        agentDir,
        pdfModelConfig,
        modelOverride,
        prompt: promptRaw,
        pdfBuffers: loadedPdfs.map((p) => ({ base64: p.base64, filename: p.filename })),
        pageNumbers,
        getExtractions,
      });

      // Details use `pdf` for a single input and `pdfs` for several.
      const describePdf = (p: { resolvedPath: string; rewrittenFrom?: string }) => ({
        pdf: p.resolvedPath,
        ...(p.rewrittenFrom ? { rewrittenFrom: p.rewrittenFrom } : {}),
      });
      const pdfDetails =
        loadedPdfs.length === 1
          ? describePdf(loadedPdfs[0])
          : { pdfs: loadedPdfs.map(describePdf) };

      return {
        content: [{ type: "text", text: result.text }],
        details: {
          model: `${result.provider}/${result.model}`,
          native: result.native,
          ...pdfDetails,
          attempts: result.attempts,
        },
      };
    },
  };
}
{ + pdfModel: { primary: "openai/gpt-5-mini" }, + pdfMaxBytesMb: 0, + pdfMaxPages: 0, + }, + }, + }); + + expect(res.ok).toBe(false); + if (!res.ok) { + expect(res.issues.some((issue) => issue.path.includes("agents.defaults.pdfMax"))).toBe(true); + } + }); + it("rejects relative iMessage attachment roots", () => { const res = validateConfigObject({ channels: { diff --git a/src/config/schema.help.ts b/src/config/schema.help.ts index 0c44947a4bf..9b940da0f40 100644 --- a/src/config/schema.help.ts +++ b/src/config/schema.help.ts @@ -922,6 +922,13 @@ export const FIELD_HELP: Record = { "agents.defaults.imageModel.primary": "Optional image model (provider/model) used when the primary model lacks image input.", "agents.defaults.imageModel.fallbacks": "Ordered fallback image models (provider/model).", + "agents.defaults.pdfModel.primary": + "Optional PDF model (provider/model) for the PDF analysis tool. Defaults to imageModel, then session model.", + "agents.defaults.pdfModel.fallbacks": "Ordered fallback PDF models (provider/model).", + "agents.defaults.pdfMaxBytesMb": + "Maximum PDF file size in megabytes for the PDF tool (default: 10).", + "agents.defaults.pdfMaxPages": + "Maximum number of PDF pages to process for the PDF tool (default: 20).", "agents.defaults.imageMaxDimensionPx": "Max image side length in pixels when sanitizing transcript/tool-result image payloads (default: 1200).", "agents.defaults.cliBackends": "Optional CLI backends for text-only fallback (claude-cli, etc.).", diff --git a/src/config/schema.labels.ts b/src/config/schema.labels.ts index a8a83ecc1b0..83cbbe27b7f 100644 --- a/src/config/schema.labels.ts +++ b/src/config/schema.labels.ts @@ -405,6 +405,10 @@ export const FIELD_LABELS: Record = { "agents.defaults.model.fallbacks": "Model Fallbacks", "agents.defaults.imageModel.primary": "Image Model", "agents.defaults.imageModel.fallbacks": "Image Model Fallbacks", + "agents.defaults.pdfModel.primary": "PDF Model", + 
"agents.defaults.pdfModel.fallbacks": "PDF Model Fallbacks", + "agents.defaults.pdfMaxBytesMb": "PDF Max Size (MB)", + "agents.defaults.pdfMaxPages": "PDF Max Pages", "agents.defaults.imageMaxDimensionPx": "Image Max Dimension (px)", "agents.defaults.humanDelay.mode": "Human Delay Mode", "agents.defaults.humanDelay.minMs": "Human Delay Min (ms)", diff --git a/src/config/types.agent-defaults.ts b/src/config/types.agent-defaults.ts index 7a7526948cc..209961da045 100644 --- a/src/config/types.agent-defaults.ts +++ b/src/config/types.agent-defaults.ts @@ -122,6 +122,12 @@ export type AgentDefaultsConfig = { model?: AgentModelConfig; /** Optional image-capable model and fallbacks (provider/model). Accepts string or {primary,fallbacks}. */ imageModel?: AgentModelConfig; + /** Optional PDF-capable model and fallbacks (provider/model). Accepts string or {primary,fallbacks}. */ + pdfModel?: AgentModelConfig; + /** Maximum PDF file size in megabytes (default: 10). */ + pdfMaxBytesMb?: number; + /** Maximum number of PDF pages to process (default: 20). */ + pdfMaxPages?: number; /** Model catalog with optional aliases (full provider/model keys). */ models?: Record; /** Agent working directory (preferred). Used as the default cwd for agent runs. 
*/ diff --git a/src/config/zod-schema.agent-defaults.ts b/src/config/zod-schema.agent-defaults.ts index e2381093492..0f0f2d408e9 100644 --- a/src/config/zod-schema.agent-defaults.ts +++ b/src/config/zod-schema.agent-defaults.ts @@ -18,6 +18,9 @@ export const AgentDefaultsSchema = z .object({ model: AgentModelSchema.optional(), imageModel: AgentModelSchema.optional(), + pdfModel: AgentModelSchema.optional(), + pdfMaxBytesMb: z.number().positive().optional(), + pdfMaxPages: z.number().int().positive().optional(), models: z .record( z.string(), diff --git a/src/media/input-files.ts b/src/media/input-files.ts index b6d2aa837aa..79d8fa1b862 100644 --- a/src/media/input-files.ts +++ b/src/media/input-files.ts @@ -2,44 +2,10 @@ import { fetchWithSsrFGuard } from "../infra/net/fetch-guard.js"; import type { SsrFPolicy } from "../infra/net/ssrf.js"; import { logWarn } from "../logger.js"; import { canonicalizeBase64, estimateBase64DecodedBytes } from "./base64.js"; +import { extractPdfContent, type PdfExtractedImage } from "./pdf-extract.js"; import { readResponseWithLimit } from "./read-response-with-limit.js"; -type CanvasModule = typeof import("@napi-rs/canvas"); -type PdfJsModule = typeof import("pdfjs-dist/legacy/build/pdf.mjs"); - -let canvasModulePromise: Promise | null = null; -let pdfJsModulePromise: Promise | null = null; - -// Lazy-load optional PDF/image deps so non-PDF paths don't require native installs. 
-async function loadCanvasModule(): Promise { - if (!canvasModulePromise) { - canvasModulePromise = import("@napi-rs/canvas").catch((err) => { - canvasModulePromise = null; - throw new Error( - `Optional dependency @napi-rs/canvas is required for PDF image extraction: ${String(err)}`, - ); - }); - } - return canvasModulePromise; -} - -async function loadPdfJsModule(): Promise { - if (!pdfJsModulePromise) { - pdfJsModulePromise = import("pdfjs-dist/legacy/build/pdf.mjs").catch((err) => { - pdfJsModulePromise = null; - throw new Error( - `Optional dependency pdfjs-dist is required for PDF extraction: ${String(err)}`, - ); - }); - } - return pdfJsModulePromise; -} - -export type InputImageContent = { - type: "image"; - data: string; - mimeType: string; -}; +export type InputImageContent = PdfExtractedImage; export type InputFileExtractResult = { filename: string; @@ -241,65 +207,6 @@ function clampText(text: string, maxChars: number): string { return text.slice(0, maxChars); } -async function extractPdfContent(params: { - buffer: Buffer; - limits: InputFileLimits; -}): Promise<{ text: string; images: InputImageContent[] }> { - const { buffer, limits } = params; - const { getDocument } = await loadPdfJsModule(); - const pdf = await getDocument({ - data: new Uint8Array(buffer), - disableWorker: true, - }).promise; - const maxPages = Math.min(pdf.numPages, limits.pdf.maxPages); - const textParts: string[] = []; - - for (let pageNum = 1; pageNum <= maxPages; pageNum += 1) { - const page = await pdf.getPage(pageNum); - const textContent = await page.getTextContent(); - const pageText = textContent.items - .map((item) => ("str" in item ? 
String(item.str) : "")) - .filter(Boolean) - .join(" "); - if (pageText) { - textParts.push(pageText); - } - } - - const text = textParts.join("\n\n"); - if (text.trim().length >= limits.pdf.minTextChars) { - return { text, images: [] }; - } - - let canvasModule: CanvasModule; - try { - canvasModule = await loadCanvasModule(); - } catch (err) { - logWarn(`media: PDF image extraction skipped; ${String(err)}`); - return { text, images: [] }; - } - const { createCanvas } = canvasModule; - const images: InputImageContent[] = []; - for (let pageNum = 1; pageNum <= maxPages; pageNum += 1) { - const page = await pdf.getPage(pageNum); - const viewport = page.getViewport({ scale: 1 }); - const maxPixels = limits.pdf.maxPixels; - const pixelBudget = Math.max(1, maxPixels); - const pagePixels = viewport.width * viewport.height; - const scale = Math.min(1, Math.sqrt(pixelBudget / pagePixels)); - const scaled = page.getViewport({ scale: Math.max(0.1, scale) }); - const canvas = createCanvas(Math.ceil(scaled.width), Math.ceil(scaled.height)); - await page.render({ - canvas: canvas as unknown as HTMLCanvasElement, - viewport: scaled, - }).promise; - const png = canvas.toBuffer("image/png"); - images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" }); - } - - return { text, images }; -} - export async function extractImageContentFromSource( source: InputImageSource, limits: InputImageLimits, @@ -409,7 +316,15 @@ export async function extractFileContentFromSource(params: { } if (mimeType === "application/pdf") { - const extracted = await extractPdfContent({ buffer, limits }); + const extracted = await extractPdfContent({ + buffer, + maxPages: limits.pdf.maxPages, + maxPixels: limits.pdf.maxPixels, + minTextChars: limits.pdf.minTextChars, + onImageExtractionError: (err) => { + logWarn(`media: PDF image extraction skipped, ${String(err)}`); + }, + }); const text = extracted.text ? 
clampText(extracted.text, limits.maxChars) : ""; return { filename,
diff --git a/src/media/pdf-extract.ts b/src/media/pdf-extract.ts
new file mode 100644
index 00000000000..cf5e66bd994
--- /dev/null
+++ b/src/media/pdf-extract.ts
@@ -0,0 +1,138 @@
+type CanvasModule = typeof import("@napi-rs/canvas");
+type PdfJsModule = typeof import("pdfjs-dist/legacy/build/pdf.mjs");
+
+let canvasModulePromise: Promise<CanvasModule> | null = null;
+let pdfJsModulePromise: Promise<PdfJsModule> | null = null;
+
+/**
+ * Lazily imports `@napi-rs/canvas` and caches the import promise.
+ * A failed import clears the cache (so a later call retries) and is rethrown
+ * as an Error naming the missing optional dependency.
+ */
+async function loadCanvasModule(): Promise<CanvasModule> {
+  if (!canvasModulePromise) {
+    canvasModulePromise = import("@napi-rs/canvas").catch((err) => {
+      canvasModulePromise = null;
+      throw new Error(
+        `Optional dependency @napi-rs/canvas is required for PDF image extraction: ${String(err)}`,
+      );
+    });
+  }
+  return canvasModulePromise;
+}
+
+/**
+ * Lazily imports the pdf.js legacy build and caches the import promise.
+ * A failed import clears the cache (so a later call retries) and is rethrown
+ * as an Error naming the missing optional dependency.
+ */
+async function loadPdfJsModule(): Promise<PdfJsModule> {
+  if (!pdfJsModulePromise) {
+    pdfJsModulePromise = import("pdfjs-dist/legacy/build/pdf.mjs").catch((err) => {
+      pdfJsModulePromise = null;
+      throw new Error(
+        `Optional dependency pdfjs-dist is required for PDF extraction: ${String(err)}`,
+      );
+    });
+  }
+  return pdfJsModulePromise;
+}
+
+/** One rendered PDF page, base64-encoded. */
+export type PdfExtractedImage = {
+  type: "image";
+  data: string;
+  mimeType: string;
+};
+
+/** Result of extractPdfContent: page text plus optional page renders. */
+export type PdfExtractedContent = {
+  text: string;
+  images: PdfExtractedImage[];
+};
+
+/**
+ * Extracts text from a PDF and, when the text is too short, renders the same pages as PNG images.
+ *
+ * @param params.buffer - Raw PDF bytes.
+ * @param params.maxPages - Upper bound on the number of pages processed.
+ * @param params.maxPixels - Target per-page pixel budget for rendered images (pages are only scaled down).
+ * @param params.minTextChars - If the trimmed text has at least this many characters, no images are rendered.
+ * @param params.pageNumbers - Optional 1-based pages to process; non-integer and out-of-range entries are dropped.
+ * @param params.onImageExtractionError - Called when the canvas dependency cannot be loaded; text is still returned.
+ * @returns Extracted text (pages joined by blank lines) and any rendered page images.
+ * @throws Error when pdfjs-dist cannot be loaded; pdf.js parse/render failures propagate unchanged.
+ */
+export async function extractPdfContent(params: {
+  buffer: Buffer;
+  maxPages: number;
+  maxPixels: number;
+  minTextChars: number;
+  pageNumbers?: number[];
+  onImageExtractionError?: (error: unknown) => void;
+}): Promise<PdfExtractedContent> {
+  const { buffer, maxPages, maxPixels, minTextChars, pageNumbers, onImageExtractionError } = params;
+  const { getDocument } = await loadPdfJsModule();
+  const pdf = await getDocument({ data: new Uint8Array(buffer), disableWorker: true }).promise;
+
+  try {
+    // getPage() rejects non-integer page numbers, so drop them along with out-of-range entries.
+    const effectivePages: number[] = pageNumbers
+      ? pageNumbers
+          .filter((p) => Number.isInteger(p) && p >= 1 && p <= pdf.numPages)
+          .slice(0, maxPages)
+      : Array.from({ length: Math.min(pdf.numPages, maxPages) }, (_, i) => i + 1);
+
+    const textParts: string[] = [];
+    for (const pageNum of effectivePages) {
+      const page = await pdf.getPage(pageNum);
+      const textContent = await page.getTextContent();
+      const pageText = textContent.items
+        .map((item) => ("str" in item ? String(item.str) : ""))
+        .filter(Boolean)
+        .join(" ");
+      if (pageText) {
+        textParts.push(pageText);
+      }
+    }
+
+    const text = textParts.join("\n\n");
+    if (text.trim().length >= minTextChars) {
+      return { text, images: [] };
+    }
+
+    // Too little text: fall back to rendering pages, which needs the optional canvas dependency.
+    let canvasModule: CanvasModule;
+    try {
+      canvasModule = await loadCanvasModule();
+    } catch (err) {
+      onImageExtractionError?.(err);
+      return { text, images: [] };
+    }
+
+    const { createCanvas } = canvasModule;
+    const images: PdfExtractedImage[] = [];
+    const pixelBudget = Math.max(1, maxPixels);
+
+    for (const pageNum of effectivePages) {
+      const page = await pdf.getPage(pageNum);
+      const viewport = page.getViewport({ scale: 1 });
+      const pagePixels = viewport.width * viewport.height;
+      // Only ever scale down (cap at 1), and never below 10% of the original size.
+      const scale = Math.min(1, Math.sqrt(pixelBudget / Math.max(1, pagePixels)));
+      const scaled = page.getViewport({ scale: Math.max(0.1, scale) });
+      const canvas = createCanvas(Math.ceil(scaled.width), Math.ceil(scaled.height));
+      await page.render({
+        canvas: canvas as unknown as HTMLCanvasElement,
+        viewport: scaled,
+      }).promise;
+      const png = canvas.toBuffer("image/png");
+      images.push({ type: "image", data: png.toString("base64"), mimeType: "image/png" });
+    }
+
+    return { text, images };
+  } finally {
+    // Release pdf.js document resources; a cleanup failure must not mask the extraction result.
+    await pdf.destroy().catch(() => undefined);
+  }
+}