mirror of https://github.com/openclaw/openclaw.git
fix(telegram): split long messages at word boundaries instead of mid-word (#56595)
Replace the proportional text estimate with an exact longest-first search for the largest text prefix whose rendered Telegram HTML fits the character limit (prefix HTML length is not monotonic in text length, so a binary search would be unsound), then split at the last whitespace boundary within that verified prefix. Single words longer than the limit still hard-split (unavoidable). Markdown formatting stays balanced across split points. Fixes #36644
This commit is contained in:
parent
865160e572
commit
ab2ef7bbfc
|
|
@ -433,28 +433,21 @@ export function splitTelegramHtmlChunks(html: string, limit: number): string[] {
|
|||
return chunks.length > 0 ? chunks : [html];
|
||||
}
|
||||
|
||||
function splitTelegramChunkByHtmlLimit(
|
||||
chunk: MarkdownIR,
|
||||
htmlLimit: number,
|
||||
renderedHtmlLength: number,
|
||||
): MarkdownIR[] {
|
||||
function splitTelegramChunkByHtmlLimit(chunk: MarkdownIR, htmlLimit: number): MarkdownIR[] {
|
||||
const currentTextLength = chunk.text.length;
|
||||
if (currentTextLength <= 1) {
|
||||
return [chunk];
|
||||
}
|
||||
const proportionalLimit = Math.floor(
|
||||
(currentTextLength * htmlLimit) / Math.max(renderedHtmlLength, 1),
|
||||
);
|
||||
const candidateLimit = Math.min(currentTextLength - 1, proportionalLimit);
|
||||
const splitLimit =
|
||||
Number.isFinite(candidateLimit) && candidateLimit > 0
|
||||
? candidateLimit
|
||||
: Math.max(1, Math.floor(currentTextLength / 2));
|
||||
const splitLimit = findLargestTelegramChunkTextLengthWithinHtmlLimit(chunk, htmlLimit);
|
||||
if (splitLimit <= 0) {
|
||||
return [chunk];
|
||||
}
|
||||
const split = splitMarkdownIRPreserveWhitespace(chunk, splitLimit);
|
||||
if (split.length > 1) {
|
||||
const firstChunk = split[0];
|
||||
if (firstChunk && renderTelegramChunkHtml(firstChunk).length <= htmlLimit) {
|
||||
return split;
|
||||
}
|
||||
return splitMarkdownIRPreserveWhitespace(chunk, Math.max(1, Math.floor(currentTextLength / 2)));
|
||||
return [sliceMarkdownIR(chunk, 0, splitLimit), sliceMarkdownIR(chunk, splitLimit, currentTextLength)];
|
||||
}
|
||||
|
||||
function sliceStyleSpans(
|
||||
|
|
@ -554,6 +547,26 @@ function renderTelegramChunkHtml(ir: MarkdownIR): string {
|
|||
return wrapFileReferencesInHtml(renderTelegramHtml(ir));
|
||||
}
|
||||
|
||||
function findLargestTelegramChunkTextLengthWithinHtmlLimit(
|
||||
chunk: MarkdownIR,
|
||||
htmlLimit: number,
|
||||
): number {
|
||||
const currentTextLength = chunk.text.length;
|
||||
if (currentTextLength <= 1) {
|
||||
return currentTextLength;
|
||||
}
|
||||
|
||||
// Prefix HTML length is not monotonic because a sliced auto-link can render as
|
||||
// a long <a ...> fragment, while a longer completed file ref de-linkifies to
|
||||
// a shorter <code>...</code> wrapper. Search exact candidates instead.
|
||||
for (let candidateLength = currentTextLength - 1; candidateLength >= 1; candidateLength -= 1) {
|
||||
if (renderTelegramChunkHtml(sliceMarkdownIR(chunk, 0, candidateLength)).length <= htmlLimit) {
|
||||
return candidateLength;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: number): number {
|
||||
const maxEnd = Math.min(text.length, start + limit);
|
||||
if (maxEnd >= text.length) {
|
||||
|
|
@ -735,7 +748,7 @@ function renderTelegramChunksWithinHtmlLimit(
|
|||
finalized.push(chunk);
|
||||
continue;
|
||||
}
|
||||
const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit, html.length);
|
||||
const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit);
|
||||
if (split.length <= 1) {
|
||||
// Worst-case safety: avoid retry loops, deliver the chunk as-is.
|
||||
finalized.push(chunk);
|
||||
|
|
|
|||
|
|
@ -175,6 +175,14 @@ describe("markdownToTelegramChunks - file reference wrapping", () => {
|
|||
expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true);
|
||||
});
|
||||
|
||||
it("prefers word boundaries when escaped html shrinks the retry window", () => {
|
||||
const input = "alpha <<";
|
||||
const chunks = markdownToTelegramChunks(input, 8);
|
||||
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
||||
expect(chunks[0]?.text).toBe("alpha ");
|
||||
expect(chunks.every((chunk) => chunk.html.length <= 8)).toBe(true);
|
||||
});
|
||||
|
||||
it("prefers word boundaries when html-limit retry splits formatted prose", () => {
|
||||
const input = "**Which of these**";
|
||||
const chunks = markdownToTelegramChunks(input, 16);
|
||||
|
|
@ -182,6 +190,35 @@ describe("markdownToTelegramChunks - file reference wrapping", () => {
|
|||
expect(chunks.every((chunk) => chunk.html.length <= 16)).toBe(true);
|
||||
});
|
||||
|
||||
it("preserves formatting while splitting at word boundaries", () => {
|
||||
const input = "**alpha <<**";
|
||||
const chunks = markdownToTelegramChunks(input, 13);
|
||||
expect(chunks.map((chunk) => chunk.text).join("")).toBe("alpha <<");
|
||||
expect(chunks[0]?.text).toBe("alpha ");
|
||||
expect(chunks.every((chunk) => chunk.html.length <= 13)).toBe(true);
|
||||
expect(chunks.every((chunk) => chunk.html.startsWith("<b>") && chunk.html.endsWith("</b>"))).toBe(
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
it("does not rely on monotonic html length for sliced file refs", () => {
|
||||
const input = "README.md<";
|
||||
const chunks = markdownToTelegramChunks(input, 22);
|
||||
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
||||
expect(chunks[0]?.text).toBe("README.md");
|
||||
expect(chunks[0]?.html).toBe("<code>README.md</code>");
|
||||
expect(chunks.every((chunk) => chunk.html.length <= 22)).toBe(true);
|
||||
});
|
||||
|
||||
it("gracefully returns the original chunk when tag overhead exceeds the limit", () => {
|
||||
const input = "**ab**";
|
||||
expect(() => markdownToTelegramChunks(input, 6)).not.toThrow();
|
||||
const chunks = markdownToTelegramChunks(input, 6);
|
||||
expect(chunks).toHaveLength(1);
|
||||
expect(chunks[0]?.text).toBe("ab");
|
||||
expect(chunks[0]?.html).toBe("<b>ab</b>");
|
||||
});
|
||||
|
||||
it("falls back to in-paren word boundaries when the parenthesis is unbalanced", () => {
|
||||
const input = "**foo (bar baz qux quux**";
|
||||
const chunks = markdownToTelegramChunks(input, 20);
|
||||
|
|
@ -189,6 +226,13 @@ describe("markdownToTelegramChunks - file reference wrapping", () => {
|
|||
expect(chunks.every((chunk) => chunk.html.length <= 20)).toBe(true);
|
||||
});
|
||||
|
||||
it("falls back to hard splits when a single word exceeds the limit", () => {
|
||||
const input = "supercalifragilistic";
|
||||
const chunks = markdownToTelegramChunks(input, 8);
|
||||
expect(chunks.map((chunk) => chunk.text)).toEqual(["supercal", "ifragili", "stic"]);
|
||||
expect(chunks.every((chunk) => chunk.html.length <= 8)).toBe(true);
|
||||
});
|
||||
|
||||
it("does not emit whitespace-only chunks during html-limit retry splitting", () => {
|
||||
const input = "**ab <<**";
|
||||
const chunks = markdownToTelegramChunks(input, 11);
|
||||
|
|
|
|||
Loading…
Reference in New Issue