From ab2ef7bbfc5c93e2fb7904efb51d10d9cc2fa5a7 Mon Sep 17 00:00:00 2001 From: Robin Waslander Date: Sat, 28 Mar 2026 21:24:59 +0100 Subject: [PATCH] fix(telegram): split long messages at word boundaries instead of mid-word (#56595) Replace the proportional text estimate with a descending linear search for the largest text prefix whose rendered Telegram HTML fits the character limit (a binary search is not safe here because rendered HTML length is not monotonic in prefix length), then split at the last whitespace boundary within that verified prefix. Single words longer than the limit still hard-split (unavoidable). Markdown formatting stays balanced across split points. Fixes #36644 --- extensions/telegram/src/format.ts | 45 ++++++++++++------- .../telegram/src/format.wrap-md.test.ts | 44 ++++++++++++++++++ 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/extensions/telegram/src/format.ts b/extensions/telegram/src/format.ts index 591e4c35a84..3fbe1224359 100644 --- a/extensions/telegram/src/format.ts +++ b/extensions/telegram/src/format.ts @@ -433,28 +433,21 @@ export function splitTelegramHtmlChunks(html: string, limit: number): string[] { return chunks.length > 0 ? chunks : [html]; } -function splitTelegramChunkByHtmlLimit( - chunk: MarkdownIR, - htmlLimit: number, - renderedHtmlLength: number, -): MarkdownIR[] { +function splitTelegramChunkByHtmlLimit(chunk: MarkdownIR, htmlLimit: number): MarkdownIR[] { const currentTextLength = chunk.text.length; if (currentTextLength <= 1) { return [chunk]; } - const proportionalLimit = Math.floor( - (currentTextLength * htmlLimit) / Math.max(renderedHtmlLength, 1), - ); - const candidateLimit = Math.min(currentTextLength - 1, proportionalLimit); - const splitLimit = - Number.isFinite(candidateLimit) && candidateLimit > 0 - ? 
candidateLimit - : Math.max(1, Math.floor(currentTextLength / 2)); + const splitLimit = findLargestTelegramChunkTextLengthWithinHtmlLimit(chunk, htmlLimit); + if (splitLimit <= 0) { + return [chunk]; + } const split = splitMarkdownIRPreserveWhitespace(chunk, splitLimit); - if (split.length > 1) { + const firstChunk = split[0]; + if (firstChunk && renderTelegramChunkHtml(firstChunk).length <= htmlLimit) { return split; } - return splitMarkdownIRPreserveWhitespace(chunk, Math.max(1, Math.floor(currentTextLength / 2))); + return [sliceMarkdownIR(chunk, 0, splitLimit), sliceMarkdownIR(chunk, splitLimit, currentTextLength)]; } function sliceStyleSpans( @@ -554,6 +547,26 @@ function renderTelegramChunkHtml(ir: MarkdownIR): string { return wrapFileReferencesInHtml(renderTelegramHtml(ir)); } +function findLargestTelegramChunkTextLengthWithinHtmlLimit( + chunk: MarkdownIR, + htmlLimit: number, +): number { + const currentTextLength = chunk.text.length; + if (currentTextLength <= 1) { + return currentTextLength; + } + + // Prefix HTML length is not monotonic because a sliced auto-link can render as + // a long <a href="..."> fragment, while a longer completed file ref de-linkifies to + // a shorter <code>...</code> wrapper. Search exact candidates instead. 
+ for (let candidateLength = currentTextLength - 1; candidateLength >= 1; candidateLength -= 1) { + if (renderTelegramChunkHtml(sliceMarkdownIR(chunk, 0, candidateLength)).length <= htmlLimit) { + return candidateLength; + } + } + return 0; +} + function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: number): number { const maxEnd = Math.min(text.length, start + limit); if (maxEnd >= text.length) { @@ -735,7 +748,7 @@ function renderTelegramChunksWithinHtmlLimit( finalized.push(chunk); continue; } - const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit, html.length); + const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit); if (split.length <= 1) { // Worst-case safety: avoid retry loops, deliver the chunk as-is. finalized.push(chunk); diff --git a/extensions/telegram/src/format.wrap-md.test.ts b/extensions/telegram/src/format.wrap-md.test.ts index de3cab42056..f8cdbda85ac 100644 --- a/extensions/telegram/src/format.wrap-md.test.ts +++ b/extensions/telegram/src/format.wrap-md.test.ts @@ -175,6 +175,14 @@ describe("markdownToTelegramChunks - file reference wrapping", () => { expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true); }); + it("prefers word boundaries when escaped html shrinks the retry window", () => { + const input = "alpha <<"; + const chunks = markdownToTelegramChunks(input, 8); + expect(chunks.map((chunk) => chunk.text).join("")).toBe(input); + expect(chunks[0]?.text).toBe("alpha "); + expect(chunks.every((chunk) => chunk.html.length <= 8)).toBe(true); + }); + it("prefers word boundaries when html-limit retry splits formatted prose", () => { const input = "**Which of these**"; const chunks = markdownToTelegramChunks(input, 16); @@ -182,6 +190,35 @@ describe("markdownToTelegramChunks - file reference wrapping", () => { expect(chunks.every((chunk) => chunk.html.length <= 16)).toBe(true); }); + it("preserves formatting while splitting at word boundaries", () => { + const input = "**alpha 
<<**"; + const chunks = markdownToTelegramChunks(input, 13); + expect(chunks.map((chunk) => chunk.text).join("")).toBe("alpha <<"); + expect(chunks[0]?.text).toBe("alpha "); + expect(chunks.every((chunk) => chunk.html.length <= 13)).toBe(true); + expect(chunks.every((chunk) => chunk.html.startsWith("") && chunk.html.endsWith(""))).toBe( + true, + ); + }); + + it("does not rely on monotonic html length for sliced file refs", () => { + const input = "README.md<"; + const chunks = markdownToTelegramChunks(input, 22); + expect(chunks.map((chunk) => chunk.text).join("")).toBe(input); + expect(chunks[0]?.text).toBe("README.md"); + expect(chunks[0]?.html).toBe("README.md"); + expect(chunks.every((chunk) => chunk.html.length <= 22)).toBe(true); + }); + + it("gracefully returns the original chunk when tag overhead exceeds the limit", () => { + const input = "**ab**"; + expect(() => markdownToTelegramChunks(input, 6)).not.toThrow(); + const chunks = markdownToTelegramChunks(input, 6); + expect(chunks).toHaveLength(1); + expect(chunks[0]?.text).toBe("ab"); + expect(chunks[0]?.html).toBe("ab"); + }); + it("falls back to in-paren word boundaries when the parenthesis is unbalanced", () => { const input = "**foo (bar baz qux quux**"; const chunks = markdownToTelegramChunks(input, 20); @@ -189,6 +226,13 @@ describe("markdownToTelegramChunks - file reference wrapping", () => { expect(chunks.every((chunk) => chunk.html.length <= 20)).toBe(true); }); + it("falls back to hard splits when a single word exceeds the limit", () => { + const input = "supercalifragilistic"; + const chunks = markdownToTelegramChunks(input, 8); + expect(chunks.map((chunk) => chunk.text)).toEqual(["supercal", "ifragili", "stic"]); + expect(chunks.every((chunk) => chunk.html.length <= 8)).toBe(true); + }); + it("does not emit whitespace-only chunks during html-limit retry splitting", () => { const input = "**ab <<**"; const chunks = markdownToTelegramChunks(input, 11);