fix: preserve Telegram word boundaries when rechunking HTML (#47274)

* fix: preserve Telegram chunk word boundaries
* fix: address Telegram chunking review feedback
* fix: preserve Telegram retry separators
* fix: preserve Telegram chunking boundaries (#47274)
2026-03-15 18:10:49 +05:30 · 2026-03-15 18:10:49 +05:30 · c4265a5f16
parent 26e0a3ee9a
commit c4265a5f16
3 changed files with 242 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
 - Email/webhook wrapping: sanitize sender and subject metadata before external-content wrapping so metadata fields cannot break the wrapper structure. Thanks @vincentkoc.
 - Node/startup: remove leftover debug `console.log("node host PATH: ...")` that printed the resolved PATH on every `openclaw node run` invocation. (#46411)
 - Telegram/message send: forward `--force-document` through the `sendPayload` path as well as `sendMedia`, so Telegram payload sends with `channelData` keep uploading images as documents instead of silently falling back to compressed photo sends. (#47119) Thanks @thepagent.
+- Telegram/message chunking: preserve spaces, paragraph separators, and word boundaries when HTML overflow rechunking splits formatted replies. (#47274)

 ## 2026.3.13

--- a/extensions/telegram/src/format.ts
+++ b/extensions/telegram/src/format.ts
@@ -512,6 +512,146 @@ function sliceLinkSpans(
  });
 }

+function sliceMarkdownIR(ir: MarkdownIR, start: number, end: number): MarkdownIR {
+  return {
+    text: ir.text.slice(start, end),
+    styles: sliceStyleSpans(ir.styles, start, end),
+    links: sliceLinkSpans(ir.links, start, end),
+  };
+}
+
+function mergeAdjacentStyleSpans(styles: MarkdownIR["styles"]): MarkdownIR["styles"] {
+  const merged: MarkdownIR["styles"] = [];
+  for (const span of styles) {
+    const last = merged.at(-1);
+    if (last && last.style === span.style && span.start <= last.end) {
+      last.end = Math.max(last.end, span.end);
+      continue;
+    }
+    merged.push({ ...span });
+  }
+  return merged;
+}
+
+function mergeAdjacentLinkSpans(links: MarkdownIR["links"]): MarkdownIR["links"] {
+  const merged: MarkdownIR["links"] = [];
+  for (const link of links) {
+    const last = merged.at(-1);
+    if (last && last.href === link.href && link.start <= last.end) {
+      last.end = Math.max(last.end, link.end);
+      continue;
+    }
+    merged.push({ ...link });
+  }
+  return merged;
+}
+
+function mergeMarkdownIRChunks(left: MarkdownIR, right: MarkdownIR): MarkdownIR {
+  const offset = left.text.length;
+  return {
+    text: left.text + right.text,
+    styles: mergeAdjacentStyleSpans([
+      ...left.styles,
+      ...right.styles.map((span) => ({
+        ...span,
+        start: span.start + offset,
+        end: span.end + offset,
+      })),
+    ]),
+    links: mergeAdjacentLinkSpans([
+      ...left.links,
+      ...right.links.map((link) => ({
+        ...link,
+        start: link.start + offset,
+        end: link.end + offset,
+      })),
+    ]),
+  };
+}
+
+function renderTelegramChunkHtml(ir: MarkdownIR): string {
+  return wrapFileReferencesInHtml(renderTelegramHtml(ir));
+}
+
+function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: number): number {
+  const maxEnd = Math.min(text.length, start + limit);
+  if (maxEnd >= text.length) {
+    return text.length;
+  }
+
+  let lastOutsideParenNewlineBreak = -1;
+  let lastOutsideParenWhitespaceBreak = -1;
+  let lastOutsideParenWhitespaceRunStart = -1;
+  let lastAnyNewlineBreak = -1;
+  let lastAnyWhitespaceBreak = -1;
+  let lastAnyWhitespaceRunStart = -1;
+  let parenDepth = 0;
+  let sawNonWhitespace = false;
+
+  for (let index = start; index < maxEnd; index += 1) {
+    const char = text[index];
+    if (char === "(") {
+      sawNonWhitespace = true;
+      parenDepth += 1;
+      continue;
+    }
+    if (char === ")" && parenDepth > 0) {
+      sawNonWhitespace = true;
+      parenDepth -= 1;
+      continue;
+    }
+    if (!/\s/.test(char)) {
+      sawNonWhitespace = true;
+      continue;
+    }
+    if (!sawNonWhitespace) {
+      continue;
+    }
+    if (char === "\n") {
+      lastAnyNewlineBreak = index + 1;
+      if (parenDepth === 0) {
+        lastOutsideParenNewlineBreak = index + 1;
+      }
+      continue;
+    }
+    const whitespaceRunStart =
+      index === start || !/\s/.test(text[index - 1] ?? "") ? index : lastAnyWhitespaceRunStart;
+    lastAnyWhitespaceBreak = index + 1;
+    lastAnyWhitespaceRunStart = whitespaceRunStart;
+    if (parenDepth === 0) {
+      lastOutsideParenWhitespaceBreak = index + 1;
+      lastOutsideParenWhitespaceRunStart = whitespaceRunStart;
+    }
+  }
+
+  const resolveWhitespaceBreak = (breakIndex: number, runStart: number): number => {
+    if (breakIndex <= start) {
+      return breakIndex;
+    }
+    if (runStart <= start) {
+      return breakIndex;
+    }
+    return /\s/.test(text[breakIndex] ?? "") ? runStart : breakIndex;
+  };
+
+  if (lastOutsideParenNewlineBreak > start) {
+    return lastOutsideParenNewlineBreak;
+  }
+  if (lastOutsideParenWhitespaceBreak > start) {
+    return resolveWhitespaceBreak(
+      lastOutsideParenWhitespaceBreak,
+      lastOutsideParenWhitespaceRunStart,
+    );
+  }
+  if (lastAnyNewlineBreak > start) {
+    return lastAnyNewlineBreak;
+  }
+  if (lastAnyWhitespaceBreak > start) {
+    return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
+  }
+  return maxEnd;
+}
+
 function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
  if (!ir.text) {
    return [];
@@ -523,7 +663,7 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
  const chunks: MarkdownIR[] = [];
  let cursor = 0;
  while (cursor < ir.text.length) {
-    const end = Math.min(ir.text.length, cursor + normalizedLimit);
+    const end = findMarkdownIRPreservedSplitIndex(ir.text, cursor, normalizedLimit);
    chunks.push({
      text: ir.text.slice(cursor, end),
      styles: sliceStyleSpans(ir.styles, cursor, end),
@@ -534,32 +674,98 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
  return chunks;
 }

+function coalesceWhitespaceOnlyMarkdownIRChunks(chunks: MarkdownIR[], limit: number): MarkdownIR[] {
+  const coalesced: MarkdownIR[] = [];
+  let index = 0;
+
+  while (index < chunks.length) {
+    const chunk = chunks[index];
+    if (!chunk) {
+      index += 1;
+      continue;
+    }
+    if (chunk.text.trim().length > 0) {
+      coalesced.push(chunk);
+      index += 1;
+      continue;
+    }
+
+    const prev = coalesced.at(-1);
+    const next = chunks[index + 1];
+    const chunkLength = chunk.text.length;
+
+    const canMergePrev = (candidate: MarkdownIR) =>
+      renderTelegramChunkHtml(candidate).length <= limit;
+    const canMergeNext = (candidate: MarkdownIR) =>
+      renderTelegramChunkHtml(candidate).length <= limit;
+
+    if (prev) {
+      const mergedPrev = mergeMarkdownIRChunks(prev, chunk);
+      if (canMergePrev(mergedPrev)) {
+        coalesced[coalesced.length - 1] = mergedPrev;
+        index += 1;
+        continue;
+      }
+    }
+
+    if (next) {
+      const mergedNext = mergeMarkdownIRChunks(chunk, next);
+      if (canMergeNext(mergedNext)) {
+        chunks[index + 1] = mergedNext;
+        index += 1;
+        continue;
+      }
+    }
+
+    if (prev && next) {
+      for (let prefixLength = chunkLength - 1; prefixLength >= 1; prefixLength -= 1) {
+        const prefix = sliceMarkdownIR(chunk, 0, prefixLength);
+        const suffix = sliceMarkdownIR(chunk, prefixLength, chunkLength);
+        const mergedPrev = mergeMarkdownIRChunks(prev, prefix);
+        const mergedNext = mergeMarkdownIRChunks(suffix, next);
+        if (canMergePrev(mergedPrev) && canMergeNext(mergedNext)) {
+          coalesced[coalesced.length - 1] = mergedPrev;
+          chunks[index + 1] = mergedNext;
+          break;
+        }
+      }
+    }
+
+    index += 1;
+  }
+
+  return coalesced;
+}
+
 function renderTelegramChunksWithinHtmlLimit(
  ir: MarkdownIR,
  limit: number,
 ): TelegramFormattedChunk[] {
  const normalizedLimit = Math.max(1, Math.floor(limit));
  const pending = chunkMarkdownIR(ir, normalizedLimit);
-  const rendered: TelegramFormattedChunk[] = [];
+  const finalized: MarkdownIR[] = [];
  while (pending.length > 0) {
    const chunk = pending.shift();
    if (!chunk) {
      continue;
    }
-    const html = wrapFileReferencesInHtml(renderTelegramHtml(chunk));
+    const html = renderTelegramChunkHtml(chunk);
    if (html.length <= normalizedLimit || chunk.text.length <= 1) {
-      rendered.push({ html, text: chunk.text });
+      finalized.push(chunk);
      continue;
    }
    const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit, html.length);
    if (split.length <= 1) {
      // Worst-case safety: avoid retry loops, deliver the chunk as-is.
-      rendered.push({ html, text: chunk.text });
+      finalized.push(chunk);
      continue;
    }
    pending.unshift(...split);
  }
-  return rendered;
+  return coalesceWhitespaceOnlyMarkdownIRChunks(finalized, normalizedLimit).map((chunk) => ({
+    html: renderTelegramChunkHtml(chunk),
+    text: chunk.text,
+  }));
 }

 export function markdownToTelegramChunks(
--- a/extensions/telegram/src/format.wrap-md.test.ts
+++ b/extensions/telegram/src/format.wrap-md.test.ts
@@ -174,6 +174,35 @@ describe("markdownToTelegramChunks - file reference wrapping", () => {
    expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
    expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true);
  });
+
+  it("prefers word boundaries when html-limit retry splits formatted prose", () => {
+    const input = "**Which of these**";
+    const chunks = markdownToTelegramChunks(input, 16);
+    expect(chunks.map((chunk) => chunk.text)).toEqual(["Which of ", "these"]);
+    expect(chunks.every((chunk) => chunk.html.length <= 16)).toBe(true);
+  });
+
+  it("falls back to in-paren word boundaries when the parenthesis is unbalanced", () => {
+    const input = "**foo (bar baz qux quux**";
+    const chunks = markdownToTelegramChunks(input, 20);
+    expect(chunks.map((chunk) => chunk.text)).toEqual(["foo", "(bar baz qux ", "quux"]);
+    expect(chunks.every((chunk) => chunk.html.length <= 20)).toBe(true);
+  });
+
+  it("does not emit whitespace-only chunks during html-limit retry splitting", () => {
+    const input = "**ab  <<**";
+    const chunks = markdownToTelegramChunks(input, 11);
+    expect(chunks.map((chunk) => chunk.text).join("")).toBe("ab  <<");
+    expect(chunks.every((chunk) => chunk.text.trim().length > 0)).toBe(true);
+    expect(chunks.every((chunk) => chunk.html.length <= 11)).toBe(true);
+  });
+
+  it("preserves paragraph separators when retry chunking produces whitespace-only spans", () => {
+    const input = "ab\n\n<<";
+    const chunks = markdownToTelegramChunks(input, 6);
+    expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
+    expect(chunks.every((chunk) => chunk.html.length <= 6)).toBe(true);
+  });
 });

 describe("edge cases", () => {