From c4265a5f166f99b19b6bccaf445463640411c4f2 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Sun, 15 Mar 2026 18:10:49 +0530 Subject: [PATCH] fix: preserve Telegram word boundaries when rechunking HTML (#47274) * fix: preserve Telegram chunk word boundaries * fix: address Telegram chunking review feedback * fix: preserve Telegram retry separators * fix: preserve Telegram chunking boundaries (#47274) --- CHANGELOG.md | 1 + extensions/telegram/src/format.ts | 218 +++++++++++++++++- .../telegram/src/format.wrap-md.test.ts | 29 +++ 3 files changed, 242 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd2212d5174..1ffe236664c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai - Email/webhook wrapping: sanitize sender and subject metadata before external-content wrapping so metadata fields cannot break the wrapper structure. Thanks @vincentkoc. - Node/startup: remove leftover debug `console.log("node host PATH: ...")` that printed the resolved PATH on every `openclaw node run` invocation. (#46411) - Telegram/message send: forward `--force-document` through the `sendPayload` path as well as `sendMedia`, so Telegram payload sends with `channelData` keep uploading images as documents instead of silently falling back to compressed photo sends. (#47119) Thanks @thepagent. +- Telegram/message chunking: preserve spaces, paragraph separators, and word boundaries when HTML overflow rechunking splits formatted replies. (#47274) ## 2026.3.13 diff --git a/extensions/telegram/src/format.ts b/extensions/telegram/src/format.ts index 1ccd8f8299b..0c1bec2a62a 100644 --- a/extensions/telegram/src/format.ts +++ b/extensions/telegram/src/format.ts @@ -512,6 +512,146 @@ function sliceLinkSpans( }); } +function sliceMarkdownIR(ir: MarkdownIR, start: number, end: number): MarkdownIR { + return { + text: ir.text.slice(start, end), + styles: sliceStyleSpans(ir.styles, start, end), + links: sliceLinkSpans(ir.links, start, end), + }; +} + +function mergeAdjacentStyleSpans(styles: MarkdownIR["styles"]): MarkdownIR["styles"] { + const merged: MarkdownIR["styles"] = []; + for (const span of styles) { + const last = merged.at(-1); + if (last && last.style === span.style && span.start <= last.end) { + last.end = Math.max(last.end, span.end); + continue; + } + merged.push({ ...span }); + } + return merged; +} + +function mergeAdjacentLinkSpans(links: MarkdownIR["links"]): MarkdownIR["links"] { + const merged: MarkdownIR["links"] = []; + for (const link of links) { + const last = merged.at(-1); + if (last && last.href === link.href && link.start <= last.end) { + last.end = Math.max(last.end, link.end); + continue; + } + merged.push({ ...link }); + } + return merged; +} + +function mergeMarkdownIRChunks(left: MarkdownIR, right: MarkdownIR): MarkdownIR { + const offset = left.text.length; + return { + text: left.text + right.text, + styles: mergeAdjacentStyleSpans([ + ...left.styles, + ...right.styles.map((span) => ({ + ...span, + start: span.start + offset, + end: span.end + offset, + })), + ]), + links: mergeAdjacentLinkSpans([ + ...left.links, + ...right.links.map((link) => ({ + ...link, + start: link.start + offset, + end: link.end + offset, + })), + ]), + }; +} + +function renderTelegramChunkHtml(ir: MarkdownIR): string { + return wrapFileReferencesInHtml(renderTelegramHtml(ir)); +} + +function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: number): number { + const maxEnd = Math.min(text.length, start + limit); + if (maxEnd >= text.length) { + return text.length; + } + + let lastOutsideParenNewlineBreak = -1; + let lastOutsideParenWhitespaceBreak = -1; + let lastOutsideParenWhitespaceRunStart = -1; + let lastAnyNewlineBreak = -1; + let lastAnyWhitespaceBreak = -1; + let lastAnyWhitespaceRunStart = -1; + let parenDepth = 0; + let sawNonWhitespace = false; + + for (let index = start; index < maxEnd; index += 1) { + const char = text[index]; + if (char === "(") { + sawNonWhitespace = true; + parenDepth += 1; + continue; + } + if (char === ")" && parenDepth > 0) { + sawNonWhitespace = true; + parenDepth -= 1; + continue; + } + if (!/\s/.test(char)) { + sawNonWhitespace = true; + continue; + } + if (!sawNonWhitespace) { + continue; + } + if (char === "\n") { + lastAnyNewlineBreak = index + 1; + if (parenDepth === 0) { + lastOutsideParenNewlineBreak = index + 1; + } + continue; + } + const whitespaceRunStart = + index === start || !/\s/.test(text[index - 1] ?? "") ? index : lastAnyWhitespaceRunStart; + lastAnyWhitespaceBreak = index + 1; + lastAnyWhitespaceRunStart = whitespaceRunStart; + if (parenDepth === 0) { + lastOutsideParenWhitespaceBreak = index + 1; + lastOutsideParenWhitespaceRunStart = whitespaceRunStart; + } + } + + const resolveWhitespaceBreak = (breakIndex: number, runStart: number): number => { + if (breakIndex <= start) { + return breakIndex; + } + if (runStart <= start) { + return breakIndex; + } + return /\s/.test(text[breakIndex] ?? "") ? runStart : breakIndex; + }; + + if (lastOutsideParenNewlineBreak > start) { + return lastOutsideParenNewlineBreak; + } + if (lastOutsideParenWhitespaceBreak > start) { + return resolveWhitespaceBreak( + lastOutsideParenWhitespaceBreak, + lastOutsideParenWhitespaceRunStart, + ); + } + if (lastAnyNewlineBreak > start) { + return lastAnyNewlineBreak; + } + if (lastAnyWhitespaceBreak > start) { + return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart); + } + return maxEnd; +} + function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] { if (!ir.text) { return []; @@ -523,7 +663,7 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd const chunks: MarkdownIR[] = []; let cursor = 0; while (cursor < ir.text.length) { - const end = Math.min(ir.text.length, cursor + normalizedLimit); + const end = findMarkdownIRPreservedSplitIndex(ir.text, cursor, normalizedLimit); chunks.push({ text: ir.text.slice(cursor, end), styles: sliceStyleSpans(ir.styles, cursor, end), @@ -534,32 +674,98 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd return chunks; } +function coalesceWhitespaceOnlyMarkdownIRChunks(chunks: MarkdownIR[], limit: number): MarkdownIR[] { + const coalesced: MarkdownIR[] = []; + let index = 0; + + while (index < chunks.length) { + const chunk = chunks[index]; + if (!chunk) { + index += 1; + continue; + } + if (chunk.text.trim().length > 0) { + coalesced.push(chunk); + index += 1; + continue; + } + + const prev = coalesced.at(-1); + const next = chunks[index + 1]; + const chunkLength = chunk.text.length; + + const canMergePrev = (candidate: MarkdownIR) => + renderTelegramChunkHtml(candidate).length <= limit; + const canMergeNext = (candidate: MarkdownIR) => + renderTelegramChunkHtml(candidate).length <= limit; + + if (prev) { + const mergedPrev = mergeMarkdownIRChunks(prev, chunk); + if (canMergePrev(mergedPrev)) { + coalesced[coalesced.length - 1] = mergedPrev; + index += 1; + continue; + } + } + + if (next) { + const mergedNext = mergeMarkdownIRChunks(chunk, next); + if (canMergeNext(mergedNext)) { + chunks[index + 1] = mergedNext; + index += 1; + continue; + } + } + + if (prev && next) { + for (let prefixLength = chunkLength - 1; prefixLength >= 1; prefixLength -= 1) { + const prefix = sliceMarkdownIR(chunk, 0, prefixLength); + const suffix = sliceMarkdownIR(chunk, prefixLength, chunkLength); + const mergedPrev = mergeMarkdownIRChunks(prev, prefix); + const mergedNext = mergeMarkdownIRChunks(suffix, next); + if (canMergePrev(mergedPrev) && canMergeNext(mergedNext)) { + coalesced[coalesced.length - 1] = mergedPrev; + chunks[index + 1] = mergedNext; + break; + } + } + } + + index += 1; + } + + return coalesced; +} + function renderTelegramChunksWithinHtmlLimit( ir: MarkdownIR, limit: number, ): TelegramFormattedChunk[] { const normalizedLimit = Math.max(1, Math.floor(limit)); const pending = chunkMarkdownIR(ir, normalizedLimit); - const rendered: TelegramFormattedChunk[] = []; + const finalized: MarkdownIR[] = []; while (pending.length > 0) { const chunk = pending.shift(); if (!chunk) { continue; } - const html = wrapFileReferencesInHtml(renderTelegramHtml(chunk)); + const html = renderTelegramChunkHtml(chunk); if (html.length <= normalizedLimit || chunk.text.length <= 1) { - rendered.push({ html, text: chunk.text }); + finalized.push(chunk); continue; } const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit, html.length); if (split.length <= 1) { // Worst-case safety: avoid retry loops, deliver the chunk as-is. - rendered.push({ html, text: chunk.text }); + finalized.push(chunk); continue; } pending.unshift(...split); } - return rendered; + return coalesceWhitespaceOnlyMarkdownIRChunks(finalized, normalizedLimit).map((chunk) => ({ + html: renderTelegramChunkHtml(chunk), + text: chunk.text, + })); } export function markdownToTelegramChunks( diff --git a/extensions/telegram/src/format.wrap-md.test.ts b/extensions/telegram/src/format.wrap-md.test.ts index 9921b669973..de3cab42056 100644 --- a/extensions/telegram/src/format.wrap-md.test.ts +++ b/extensions/telegram/src/format.wrap-md.test.ts @@ -174,6 +174,35 @@ describe("markdownToTelegramChunks - file reference wrapping", () => { expect(chunks.map((chunk) => chunk.text).join("")).toBe(input); expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true); }); + + it("prefers word boundaries when html-limit retry splits formatted prose", () => { + const input = "**Which of these**"; + const chunks = markdownToTelegramChunks(input, 16); + expect(chunks.map((chunk) => chunk.text)).toEqual(["Which of ", "these"]); + expect(chunks.every((chunk) => chunk.html.length <= 16)).toBe(true); + }); + + it("falls back to in-paren word boundaries when the parenthesis is unbalanced", () => { + const input = "**foo (bar baz qux quux**"; + const chunks = markdownToTelegramChunks(input, 20); + expect(chunks.map((chunk) => chunk.text)).toEqual(["foo", "(bar baz qux ", "quux"]); + expect(chunks.every((chunk) => chunk.html.length <= 20)).toBe(true); + }); + + it("does not emit whitespace-only chunks during html-limit retry splitting", () => { + const input = "**ab <<**"; + const chunks = markdownToTelegramChunks(input, 11); + expect(chunks.map((chunk) => chunk.text).join("")).toBe("ab <<"); + expect(chunks.every((chunk) => chunk.text.trim().length > 0)).toBe(true); + expect(chunks.every((chunk) => chunk.html.length <= 11)).toBe(true); + }); + + it("preserves paragraph separators when retry chunking produces whitespace-only spans", () => { + const input = "ab\n\n<<"; + const chunks = markdownToTelegramChunks(input, 6); + expect(chunks.map((chunk) => chunk.text).join("")).toBe(input); + expect(chunks.every((chunk) => chunk.html.length <= 6)).toBe(true); + }); }); describe("edge cases", () => {