diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts index 8bebb8e2039..1fb8ae240ea 100644 --- a/src/media-understanding/apply.test.ts +++ b/src/media-understanding/apply.test.ts @@ -41,7 +41,7 @@ describe("applyMediaUnderstanding", () => { mockedResolveApiKey.mockClear(); mockedFetchRemoteMedia.mockReset(); mockedFetchRemoteMedia.mockResolvedValue({ - buffer: Buffer.from("audio-bytes"), + buffer: Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), contentType: "audio/ogg", fileName: "note.ogg", }); @@ -51,7 +51,7 @@ describe("applyMediaUnderstanding", () => { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); const audioPath = path.join(dir, "note.ogg"); - await fs.writeFile(audioPath, "hello"); + await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8])); const ctx: MsgContext = { Body: "", @@ -94,7 +94,7 @@ describe("applyMediaUnderstanding", () => { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); const audioPath = path.join(dir, "note.ogg"); - await fs.writeFile(audioPath, "hello"); + await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8])); const ctx: MsgContext = { Body: " /capture status", @@ -176,7 +176,7 @@ describe("applyMediaUnderstanding", () => { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); const audioPath = path.join(dir, "large.wav"); - await fs.writeFile(audioPath, "0123456789"); + await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])); const ctx: MsgContext = { Body: "", @@ -211,7 +211,7 @@ describe("applyMediaUnderstanding", () => { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); const audioPath = path.join(dir, "note.ogg"); - await fs.writeFile(audioPath, "hello"); + await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8])); const ctx: MsgContext = { Body: "", @@ -352,7 +352,7 @@ describe("applyMediaUnderstanding", () => { const { applyMediaUnderstanding } = await loadApply(); const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); const audioPath = path.join(dir, "fallback.ogg"); - await fs.writeFile(audioPath, "hello"); + await fs.writeFile(audioPath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6])); const ctx: MsgContext = { Body: "", @@ -390,8 +390,8 @@ describe("applyMediaUnderstanding", () => { const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); const audioPathA = path.join(dir, "note-a.ogg"); const audioPathB = path.join(dir, "note-b.ogg"); - await fs.writeFile(audioPathA, "hello"); - await fs.writeFile(audioPathB, "world"); + await fs.writeFile(audioPathA, Buffer.from([200, 201, 202, 203, 204, 205, 206, 207, 208])); + await fs.writeFile(audioPathB, Buffer.from([200, 201, 202, 203, 204, 205, 206, 207, 208])); const ctx: MsgContext = { Body: "", @@ -435,7 +435,7 @@ describe("applyMediaUnderstanding", () => { const audioPath = path.join(dir, "note.ogg"); const videoPath = path.join(dir, "clip.mp4"); await fs.writeFile(imagePath, "image-bytes"); - await fs.writeFile(audioPath, "audio-bytes"); + await fs.writeFile(audioPath, Buffer.from([200, 201, 202, 203, 204, 205, 206, 207, 208])); await fs.writeFile(videoPath, "video-bytes"); const ctx: MsgContext = { @@ -487,4 +487,63 @@ describe("applyMediaUnderstanding", () => { expect(ctx.CommandBody).toBe("audio ok"); expect(ctx.BodyForCommands).toBe("audio ok"); }); + + it("treats text-like audio attachments as CSV (comma wins over tabs)", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); + const csvPath = path.join(dir, "data.mp3"); + const csvText = '"a","b"\t"c"\n"1","2"\t"3"'; + const csvBuffer = Buffer.concat([Buffer.from([0xff, 0xfe]), Buffer.from(csvText, "utf16le")]); + await fs.writeFile(csvPath, csvBuffer); + + const ctx: MsgContext = { + Body: "", + MediaPath: csvPath, + MediaType: "audio/mpeg", + }; + const cfg: MoltbotConfig = { + tools: { + media: { + audio: { enabled: false }, + image: { enabled: false }, + video: { enabled: false }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ ctx, cfg }); + + expect(result.appliedFile).toBe(true); + expect(ctx.Body).toContain(''); + expect(ctx.Body).toContain('"a","b"\t"c"'); + }); + + it("infers TSV when tabs are present without commas", async () => { + const { applyMediaUnderstanding } = await loadApply(); + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "moltbot-media-")); + const tsvPath = path.join(dir, "report.mp3"); + const tsvText = "a\tb\tc\n1\t2\t3"; + await fs.writeFile(tsvPath, tsvText); + + const ctx: MsgContext = { + Body: "", + MediaPath: tsvPath, + MediaType: "audio/mpeg", + }; + const cfg: MoltbotConfig = { + tools: { + media: { + audio: { enabled: false }, + image: { enabled: false }, + video: { enabled: false }, + }, + }, + }; + + const result = await applyMediaUnderstanding({ ctx, cfg }); + + expect(result.appliedFile).toBe(true); + expect(ctx.Body).toContain(''); + expect(ctx.Body).toContain("a\tb\tc"); + }); }); diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index dab640789d6..e95d272418c 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -1,6 +1,22 @@ +import path from "node:path"; + import type { MoltbotConfig } from "../config/config.js"; import type { MsgContext } from "../auto-reply/templating.js"; import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js"; +import { logVerbose, shouldLogVerbose } from "../globals.js"; +import { + DEFAULT_INPUT_FILE_MAX_BYTES, + DEFAULT_INPUT_FILE_MAX_CHARS, + DEFAULT_INPUT_FILE_MIMES, + DEFAULT_INPUT_MAX_REDIRECTS, + DEFAULT_INPUT_PDF_MAX_PAGES, + DEFAULT_INPUT_PDF_MAX_PIXELS, + DEFAULT_INPUT_PDF_MIN_TEXT_CHARS, + DEFAULT_INPUT_TIMEOUT_MS, + extractFileContentFromSource, + normalizeMimeList, + normalizeMimeType, +} from "../media/input-files.js"; import { extractMediaUserText, formatAudioTranscripts, @@ -14,6 +30,7 @@ import type { } from "./types.js"; import { runWithConcurrency } from "./concurrency.js"; import { resolveConcurrency } from "./resolve.js"; +import { resolveAttachmentKind } from "./attachments.js"; import { type ActiveMediaModel, buildProviderRegistry, @@ -28,9 +45,255 @@ export type ApplyMediaUnderstandingResult = { appliedImage: boolean; appliedAudio: boolean; appliedVideo: boolean; + appliedFile: boolean; }; const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"]; +const EXTRA_TEXT_MIMES = [ + "application/xml", + "text/xml", + "application/x-yaml", + "text/yaml", + "application/yaml", + "application/javascript", + "text/javascript", + "text/tab-separated-values", +]; +const TEXT_EXT_MIME = new Map([ + [".csv", "text/csv"], + [".tsv", "text/tab-separated-values"], + [".txt", "text/plain"], + [".md", "text/markdown"], + [".log", "text/plain"], + [".ini", "text/plain"], + [".cfg", "text/plain"], + [".conf", "text/plain"], + [".env", "text/plain"], + [".json", "application/json"], + [".yaml", "text/yaml"], + [".yml", "text/yaml"], + [".xml", "application/xml"], +]); + +function resolveFileLimits(cfg: MoltbotConfig) { + const files = cfg.gateway?.http?.endpoints?.responses?.files; + return { + allowUrl: files?.allowUrl ?? true, + allowedMimes: normalizeMimeList(files?.allowedMimes, DEFAULT_INPUT_FILE_MIMES), + maxBytes: files?.maxBytes ?? DEFAULT_INPUT_FILE_MAX_BYTES, + maxChars: files?.maxChars ?? DEFAULT_INPUT_FILE_MAX_CHARS, + maxRedirects: files?.maxRedirects ?? DEFAULT_INPUT_MAX_REDIRECTS, + timeoutMs: files?.timeoutMs ?? DEFAULT_INPUT_TIMEOUT_MS, + pdf: { + maxPages: files?.pdf?.maxPages ?? DEFAULT_INPUT_PDF_MAX_PAGES, + maxPixels: files?.pdf?.maxPixels ?? DEFAULT_INPUT_PDF_MAX_PIXELS, + minTextChars: files?.pdf?.minTextChars ?? DEFAULT_INPUT_PDF_MIN_TEXT_CHARS, + }, + }; +} + +function appendFileBlocks(body: string | undefined, blocks: string[]): string { + if (!blocks || blocks.length === 0) { + return body ?? ""; + } + const base = typeof body === "string" ? body.trim() : ""; + const suffix = blocks.join("\n\n").trim(); + if (!base) { + return suffix; + } + return `${base}\n\n${suffix}`.trim(); +} + +function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined { + if (!buffer || buffer.length < 2) return undefined; + const b0 = buffer[0]; + const b1 = buffer[1]; + if (b0 === 0xff && b1 === 0xfe) { + return "utf-16le"; + } + if (b0 === 0xfe && b1 === 0xff) { + return "utf-16be"; + } + const sampleLen = Math.min(buffer.length, 2048); + let zeroCount = 0; + for (let i = 0; i < sampleLen; i += 1) { + if (buffer[i] === 0) zeroCount += 1; + } + if (zeroCount / sampleLen > 0.2) { + return "utf-16le"; + } + return undefined; +} + +function looksLikeUtf8Text(buffer?: Buffer): boolean { + if (!buffer || buffer.length === 0) return false; + const sampleLen = Math.min(buffer.length, 4096); + let printable = 0; + let other = 0; + for (let i = 0; i < sampleLen; i += 1) { + const byte = buffer[i]; + if (byte === 0) { + other += 1; + continue; + } + if (byte === 9 || byte === 10 || byte === 13 || (byte >= 32 && byte <= 126)) { + printable += 1; + } else { + other += 1; + } + } + const total = printable + other; + if (total === 0) return false; + return printable / total > 0.85; +} + +function decodeTextSample(buffer?: Buffer): string { + if (!buffer || buffer.length === 0) return ""; + const sample = buffer.subarray(0, Math.min(buffer.length, 8192)); + const utf16Charset = resolveUtf16Charset(sample); + if (utf16Charset === "utf-16be") { + const swapped = Buffer.alloc(sample.length); + for (let i = 0; i + 1 < sample.length; i += 2) { + swapped[i] = sample[i + 1]; + swapped[i + 1] = sample[i]; + } + return new TextDecoder("utf-16le").decode(swapped); + } + if (utf16Charset === "utf-16le") { + return new TextDecoder("utf-16le").decode(sample); + } + return new TextDecoder("utf-8").decode(sample); +} + +function guessDelimitedMime(text: string): string | undefined { + if (!text) return undefined; + const line = text.split(/\r?\n/)[0] ?? ""; + const tabs = (line.match(/\t/g) ?? []).length; + const commas = (line.match(/,/g) ?? []).length; + if (commas > 0) { + return "text/csv"; + } + if (tabs > 0) { + return "text/tab-separated-values"; + } + return undefined; +} + +function resolveTextMimeFromName(name?: string): string | undefined { + if (!name) return undefined; + const ext = path.extname(name).toLowerCase(); + return TEXT_EXT_MIME.get(ext); +} + +async function extractFileBlocks(params: { + attachments: ReturnType; + cache: ReturnType; + limits: ReturnType; +}): Promise { + const { attachments, cache, limits } = params; + if (!attachments || attachments.length === 0) { + return []; + } + const blocks: string[] = []; + for (const attachment of attachments) { + if (!attachment) { + continue; + } + const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? ""); + const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment); + if (!forcedTextMime && (kind === "image" || kind === "video")) { + continue; + } + if (!limits.allowUrl && attachment.url && !attachment.path) { + if (shouldLogVerbose()) { + logVerbose(`media: file attachment skipped (url disabled) index=${attachment.index}`); + } + continue; + } + let bufferResult: Awaited>; + try { + bufferResult = await cache.getBuffer({ + attachmentIndex: attachment.index, + maxBytes: limits.maxBytes, + timeoutMs: limits.timeoutMs, + }); + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`media: file attachment skipped (buffer): ${String(err)}`); + } + continue; + } + const nameHint = bufferResult?.fileName ?? attachment.path ?? attachment.url; + const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? ""); + const utf16Charset = resolveUtf16Charset(bufferResult?.buffer); + const textSample = decodeTextSample(bufferResult?.buffer); + const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer); + if (!forcedTextMimeResolved && kind === "audio" && !textLike) { + continue; + } + const guessedDelimited = textLike ? guessDelimitedMime(textSample) : undefined; + const textHint = + forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined); + const rawMime = bufferResult?.mime ?? attachment.mime; + const mimeType = textHint ?? normalizeMimeType(rawMime); + if (!mimeType) { + if (shouldLogVerbose()) { + logVerbose(`media: file attachment skipped (unknown mime) index=${attachment.index}`); + } + continue; + } + const allowedMimes = new Set(limits.allowedMimes); + for (const extra of EXTRA_TEXT_MIMES) { + allowedMimes.add(extra); + } + if (mimeType.startsWith("text/")) { + allowedMimes.add(mimeType); + } + if (!allowedMimes.has(mimeType)) { + if (shouldLogVerbose()) { + logVerbose( + `media: file attachment skipped (unsupported mime ${mimeType}) index=${attachment.index}`, + ); + } + continue; + } + let extracted: Awaited>; + try { + const mediaType = utf16Charset ? `${mimeType}; charset=${utf16Charset}` : mimeType; + extracted = await extractFileContentFromSource({ + source: { + type: "base64", + data: bufferResult.buffer.toString("base64"), + mediaType, + filename: bufferResult.fileName, + }, + limits: { + ...limits, + allowedMimes, + }, + }); + } catch (err) { + if (shouldLogVerbose()) { + logVerbose(`media: file attachment skipped (extract): ${String(err)}`); + } + continue; + } + const text = extracted?.text?.trim() ?? ""; + let blockText = text; + if (!blockText) { + if (extracted?.images && extracted.images.length > 0) { + blockText = "[PDF content rendered to images; images not forwarded to model]"; + } else { + blockText = "[No extractable text]"; + } + } + const safeName = (bufferResult.fileName ?? `file-${attachment.index + 1}`) + .replace(/[\r\n\t]+/g, " ") + .trim(); + blocks.push(`\n${blockText}\n`); + } + return blocks; +} export async function applyMediaUnderstanding(params: { ctx: MsgContext; @@ -51,6 +314,12 @@ export async function applyMediaUnderstanding(params: { const cache = createMediaAttachmentCache(attachments); try { + const fileBlocks = await extractFileBlocks({ + attachments, + cache, + limits: resolveFileLimits(cfg), + }); + const tasks = CAPABILITY_ORDER.map((capability) => async () => { const config = cfg.tools?.media?.[capability]; return await runCapability({ @@ -99,7 +368,15 @@ export async function applyMediaUnderstanding(params: { ctx.RawBody = originalUserText; } ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs]; - finalizeInboundContext(ctx, { forceBodyForAgent: true, forceBodyForCommands: true }); + } + if (fileBlocks.length > 0) { + ctx.Body = appendFileBlocks(ctx.Body, fileBlocks); + } + if (outputs.length > 0 || fileBlocks.length > 0) { + finalizeInboundContext(ctx, { + forceBodyForAgent: true, + forceBodyForCommands: outputs.length > 0, + }); } return { @@ -108,6 +385,7 @@ export async function applyMediaUnderstanding(params: { appliedImage: outputs.some((output) => output.kind === "image.description"), appliedAudio: outputs.some((output) => output.kind === "audio.transcription"), appliedVideo: outputs.some((output) => output.kind === "video.description"), + appliedFile: fileBlocks.length > 0, }; } finally { await cache.cleanup(); diff --git a/src/telegram/bot/delivery.ts b/src/telegram/bot/delivery.ts index 4f45f999708..c6d731d2e72 100644 --- a/src/telegram/bot/delivery.ts +++ b/src/telegram/bot/delivery.ts @@ -310,7 +310,14 @@ export async function resolveMedia( fetchImpl, filePathHint: file.file_path, }); - const saved = await saveMediaBuffer(fetched.buffer, fetched.contentType, "inbound", maxBytes); + const originalName = fetched.fileName ?? file.file_path; + const saved = await saveMediaBuffer( + fetched.buffer, + fetched.contentType, + "inbound", + maxBytes, + originalName, + ); // Check sticker cache for existing description const cached = sticker.file_unique_id ? getCachedSticker(sticker.file_unique_id) : null; @@ -377,7 +384,14 @@ export async function resolveMedia( fetchImpl, filePathHint: file.file_path, }); - const saved = await saveMediaBuffer(fetched.buffer, fetched.contentType, "inbound", maxBytes); + const originalName = fetched.fileName ?? file.file_path; + const saved = await saveMediaBuffer( + fetched.buffer, + fetched.contentType, + "inbound", + maxBytes, + originalName, + ); let placeholder = ""; if (msg.photo) placeholder = ""; else if (msg.video) placeholder = ""; diff --git a/src/telegram/download.ts b/src/telegram/download.ts index 1b3c61e2240..31f431db023 100644 --- a/src/telegram/download.ts +++ b/src/telegram/download.ts @@ -40,7 +40,7 @@ export async function downloadTelegramFile( filePath: info.file_path, }); // save with inbound subdir - const saved = await saveMediaBuffer(array, mime, "inbound", maxBytes); + const saved = await saveMediaBuffer(array, mime, "inbound", maxBytes, info.file_path); // Ensure extension matches mime if possible if (!saved.contentType && mime) saved.contentType = mime; return saved;