mirror of https://github.com/openclaw/openclaw.git
fix: preserve Telegram word boundaries when rechunking HTML (#47274)
* fix: preserve Telegram chunk word boundaries * fix: address Telegram chunking review feedback * fix: preserve Telegram retry separators * fix: preserve Telegram chunking boundaries (#47274)
This commit is contained in:
parent
26e0a3ee9a
commit
c4265a5f16
|
|
@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
|
||||||
- Email/webhook wrapping: sanitize sender and subject metadata before external-content wrapping so metadata fields cannot break the wrapper structure. Thanks @vincentkoc.
|
- Email/webhook wrapping: sanitize sender and subject metadata before external-content wrapping so metadata fields cannot break the wrapper structure. Thanks @vincentkoc.
|
||||||
- Node/startup: remove leftover debug `console.log("node host PATH: ...")` that printed the resolved PATH on every `openclaw node run` invocation. (#46411)
|
- Node/startup: remove leftover debug `console.log("node host PATH: ...")` that printed the resolved PATH on every `openclaw node run` invocation. (#46411)
|
||||||
- Telegram/message send: forward `--force-document` through the `sendPayload` path as well as `sendMedia`, so Telegram payload sends with `channelData` keep uploading images as documents instead of silently falling back to compressed photo sends. (#47119) Thanks @thepagent.
|
- Telegram/message send: forward `--force-document` through the `sendPayload` path as well as `sendMedia`, so Telegram payload sends with `channelData` keep uploading images as documents instead of silently falling back to compressed photo sends. (#47119) Thanks @thepagent.
|
||||||
|
- Telegram/message chunking: preserve spaces, paragraph separators, and word boundaries when HTML overflow rechunking splits formatted replies. (#47274)
|
||||||
|
|
||||||
## 2026.3.13
|
## 2026.3.13
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -512,6 +512,146 @@ function sliceLinkSpans(
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function sliceMarkdownIR(ir: MarkdownIR, start: number, end: number): MarkdownIR {
|
||||||
|
return {
|
||||||
|
text: ir.text.slice(start, end),
|
||||||
|
styles: sliceStyleSpans(ir.styles, start, end),
|
||||||
|
links: sliceLinkSpans(ir.links, start, end),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function mergeAdjacentStyleSpans(styles: MarkdownIR["styles"]): MarkdownIR["styles"] {
|
||||||
|
const merged: MarkdownIR["styles"] = [];
|
||||||
|
for (const span of styles) {
|
||||||
|
const last = merged.at(-1);
|
||||||
|
if (last && last.style === span.style && span.start <= last.end) {
|
||||||
|
last.end = Math.max(last.end, span.end);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
merged.push({ ...span });
|
||||||
|
}
|
||||||
|
return merged;
|
||||||
|
}
|
||||||
|
|
||||||
|
function mergeAdjacentLinkSpans(links: MarkdownIR["links"]): MarkdownIR["links"] {
|
||||||
|
const merged: MarkdownIR["links"] = [];
|
||||||
|
for (const link of links) {
|
||||||
|
const last = merged.at(-1);
|
||||||
|
if (last && last.href === link.href && link.start <= last.end) {
|
||||||
|
last.end = Math.max(last.end, link.end);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
merged.push({ ...link });
|
||||||
|
}
|
||||||
|
return merged;
|
||||||
|
}
|
||||||
|
|
||||||
|
function mergeMarkdownIRChunks(left: MarkdownIR, right: MarkdownIR): MarkdownIR {
|
||||||
|
const offset = left.text.length;
|
||||||
|
return {
|
||||||
|
text: left.text + right.text,
|
||||||
|
styles: mergeAdjacentStyleSpans([
|
||||||
|
...left.styles,
|
||||||
|
...right.styles.map((span) => ({
|
||||||
|
...span,
|
||||||
|
start: span.start + offset,
|
||||||
|
end: span.end + offset,
|
||||||
|
})),
|
||||||
|
]),
|
||||||
|
links: mergeAdjacentLinkSpans([
|
||||||
|
...left.links,
|
||||||
|
...right.links.map((link) => ({
|
||||||
|
...link,
|
||||||
|
start: link.start + offset,
|
||||||
|
end: link.end + offset,
|
||||||
|
})),
|
||||||
|
]),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function renderTelegramChunkHtml(ir: MarkdownIR): string {
|
||||||
|
return wrapFileReferencesInHtml(renderTelegramHtml(ir));
|
||||||
|
}
|
||||||
|
|
||||||
|
function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: number): number {
|
||||||
|
const maxEnd = Math.min(text.length, start + limit);
|
||||||
|
if (maxEnd >= text.length) {
|
||||||
|
return text.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
let lastOutsideParenNewlineBreak = -1;
|
||||||
|
let lastOutsideParenWhitespaceBreak = -1;
|
||||||
|
let lastOutsideParenWhitespaceRunStart = -1;
|
||||||
|
let lastAnyNewlineBreak = -1;
|
||||||
|
let lastAnyWhitespaceBreak = -1;
|
||||||
|
let lastAnyWhitespaceRunStart = -1;
|
||||||
|
let parenDepth = 0;
|
||||||
|
let sawNonWhitespace = false;
|
||||||
|
|
||||||
|
for (let index = start; index < maxEnd; index += 1) {
|
||||||
|
const char = text[index];
|
||||||
|
if (char === "(") {
|
||||||
|
sawNonWhitespace = true;
|
||||||
|
parenDepth += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (char === ")" && parenDepth > 0) {
|
||||||
|
sawNonWhitespace = true;
|
||||||
|
parenDepth -= 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!/\s/.test(char)) {
|
||||||
|
sawNonWhitespace = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!sawNonWhitespace) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (char === "\n") {
|
||||||
|
lastAnyNewlineBreak = index + 1;
|
||||||
|
if (parenDepth === 0) {
|
||||||
|
lastOutsideParenNewlineBreak = index + 1;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const whitespaceRunStart =
|
||||||
|
index === start || !/\s/.test(text[index - 1] ?? "") ? index : lastAnyWhitespaceRunStart;
|
||||||
|
lastAnyWhitespaceBreak = index + 1;
|
||||||
|
lastAnyWhitespaceRunStart = whitespaceRunStart;
|
||||||
|
if (parenDepth === 0) {
|
||||||
|
lastOutsideParenWhitespaceBreak = index + 1;
|
||||||
|
lastOutsideParenWhitespaceRunStart = whitespaceRunStart;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const resolveWhitespaceBreak = (breakIndex: number, runStart: number): number => {
|
||||||
|
if (breakIndex <= start) {
|
||||||
|
return breakIndex;
|
||||||
|
}
|
||||||
|
if (runStart <= start) {
|
||||||
|
return breakIndex;
|
||||||
|
}
|
||||||
|
return /\s/.test(text[breakIndex] ?? "") ? runStart : breakIndex;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (lastOutsideParenNewlineBreak > start) {
|
||||||
|
return lastOutsideParenNewlineBreak;
|
||||||
|
}
|
||||||
|
if (lastOutsideParenWhitespaceBreak > start) {
|
||||||
|
return resolveWhitespaceBreak(
|
||||||
|
lastOutsideParenWhitespaceBreak,
|
||||||
|
lastOutsideParenWhitespaceRunStart,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (lastAnyNewlineBreak > start) {
|
||||||
|
return lastAnyNewlineBreak;
|
||||||
|
}
|
||||||
|
if (lastAnyWhitespaceBreak > start) {
|
||||||
|
return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
|
||||||
|
}
|
||||||
|
return maxEnd;
|
||||||
|
}
|
||||||
|
|
||||||
function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
|
function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
|
||||||
if (!ir.text) {
|
if (!ir.text) {
|
||||||
return [];
|
return [];
|
||||||
|
|
@ -523,7 +663,7 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
|
||||||
const chunks: MarkdownIR[] = [];
|
const chunks: MarkdownIR[] = [];
|
||||||
let cursor = 0;
|
let cursor = 0;
|
||||||
while (cursor < ir.text.length) {
|
while (cursor < ir.text.length) {
|
||||||
const end = Math.min(ir.text.length, cursor + normalizedLimit);
|
const end = findMarkdownIRPreservedSplitIndex(ir.text, cursor, normalizedLimit);
|
||||||
chunks.push({
|
chunks.push({
|
||||||
text: ir.text.slice(cursor, end),
|
text: ir.text.slice(cursor, end),
|
||||||
styles: sliceStyleSpans(ir.styles, cursor, end),
|
styles: sliceStyleSpans(ir.styles, cursor, end),
|
||||||
|
|
@ -534,32 +674,98 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function coalesceWhitespaceOnlyMarkdownIRChunks(chunks: MarkdownIR[], limit: number): MarkdownIR[] {
|
||||||
|
const coalesced: MarkdownIR[] = [];
|
||||||
|
let index = 0;
|
||||||
|
|
||||||
|
while (index < chunks.length) {
|
||||||
|
const chunk = chunks[index];
|
||||||
|
if (!chunk) {
|
||||||
|
index += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (chunk.text.trim().length > 0) {
|
||||||
|
coalesced.push(chunk);
|
||||||
|
index += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const prev = coalesced.at(-1);
|
||||||
|
const next = chunks[index + 1];
|
||||||
|
const chunkLength = chunk.text.length;
|
||||||
|
|
||||||
|
const canMergePrev = (candidate: MarkdownIR) =>
|
||||||
|
renderTelegramChunkHtml(candidate).length <= limit;
|
||||||
|
const canMergeNext = (candidate: MarkdownIR) =>
|
||||||
|
renderTelegramChunkHtml(candidate).length <= limit;
|
||||||
|
|
||||||
|
if (prev) {
|
||||||
|
const mergedPrev = mergeMarkdownIRChunks(prev, chunk);
|
||||||
|
if (canMergePrev(mergedPrev)) {
|
||||||
|
coalesced[coalesced.length - 1] = mergedPrev;
|
||||||
|
index += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (next) {
|
||||||
|
const mergedNext = mergeMarkdownIRChunks(chunk, next);
|
||||||
|
if (canMergeNext(mergedNext)) {
|
||||||
|
chunks[index + 1] = mergedNext;
|
||||||
|
index += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (prev && next) {
|
||||||
|
for (let prefixLength = chunkLength - 1; prefixLength >= 1; prefixLength -= 1) {
|
||||||
|
const prefix = sliceMarkdownIR(chunk, 0, prefixLength);
|
||||||
|
const suffix = sliceMarkdownIR(chunk, prefixLength, chunkLength);
|
||||||
|
const mergedPrev = mergeMarkdownIRChunks(prev, prefix);
|
||||||
|
const mergedNext = mergeMarkdownIRChunks(suffix, next);
|
||||||
|
if (canMergePrev(mergedPrev) && canMergeNext(mergedNext)) {
|
||||||
|
coalesced[coalesced.length - 1] = mergedPrev;
|
||||||
|
chunks[index + 1] = mergedNext;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
index += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return coalesced;
|
||||||
|
}
|
||||||
|
|
||||||
/**
 * Splits a Markdown IR document into Telegram-ready chunks whose rendered
 * HTML each fits within `limit` characters, then renders them.
 *
 * Works as a retry queue: initial chunks come from `chunkMarkdownIR`; any
 * chunk whose HTML overflows the limit is re-split by
 * `splitTelegramChunkByHtmlLimit` and its pieces pushed back to the front of
 * the queue for re-checking. Whitespace-only fragments left over from retry
 * splitting are coalesced back into neighbors before the final render.
 */
function renderTelegramChunksWithinHtmlLimit(
  ir: MarkdownIR,
  limit: number,
): TelegramFormattedChunk[] {
  // Guard against zero, negative, or fractional limits.
  const normalizedLimit = Math.max(1, Math.floor(limit));
  const pending = chunkMarkdownIR(ir, normalizedLimit);
  const finalized: MarkdownIR[] = [];
  while (pending.length > 0) {
    const chunk = pending.shift();
    if (!chunk) {
      continue;
    }
    const html = renderTelegramChunkHtml(chunk);
    // Accept when the HTML fits, or when the chunk cannot be split further
    // (single character) and must be delivered regardless.
    if (html.length <= normalizedLimit || chunk.text.length <= 1) {
      finalized.push(chunk);
      continue;
    }
    const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit, html.length);
    if (split.length <= 1) {
      // Worst-case safety: avoid retry loops, deliver the chunk as-is.
      finalized.push(chunk);
      continue;
    }
    // Re-queue the split pieces in order; each may still need splitting.
    pending.unshift(...split);
  }
  // Merge whitespace-only retry fragments into neighbors, then render each
  // surviving chunk to its final HTML/text pair.
  return coalesceWhitespaceOnlyMarkdownIRChunks(finalized, normalizedLimit).map((chunk) => ({
    html: renderTelegramChunkHtml(chunk),
    text: chunk.text,
  }));
}
||||||
export function markdownToTelegramChunks(
|
export function markdownToTelegramChunks(
|
||||||
|
|
|
||||||
|
|
@ -174,6 +174,35 @@ describe("markdownToTelegramChunks - file reference wrapping", () => {
|
||||||
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
||||||
expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true);
|
expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("prefers word boundaries when html-limit retry splits formatted prose", () => {
|
||||||
|
const input = "**Which of these**";
|
||||||
|
const chunks = markdownToTelegramChunks(input, 16);
|
||||||
|
expect(chunks.map((chunk) => chunk.text)).toEqual(["Which of ", "these"]);
|
||||||
|
expect(chunks.every((chunk) => chunk.html.length <= 16)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("falls back to in-paren word boundaries when the parenthesis is unbalanced", () => {
|
||||||
|
const input = "**foo (bar baz qux quux**";
|
||||||
|
const chunks = markdownToTelegramChunks(input, 20);
|
||||||
|
expect(chunks.map((chunk) => chunk.text)).toEqual(["foo", "(bar baz qux ", "quux"]);
|
||||||
|
expect(chunks.every((chunk) => chunk.html.length <= 20)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("does not emit whitespace-only chunks during html-limit retry splitting", () => {
|
||||||
|
const input = "**ab <<**";
|
||||||
|
const chunks = markdownToTelegramChunks(input, 11);
|
||||||
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe("ab <<");
|
||||||
|
expect(chunks.every((chunk) => chunk.text.trim().length > 0)).toBe(true);
|
||||||
|
expect(chunks.every((chunk) => chunk.html.length <= 11)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("preserves paragraph separators when retry chunking produces whitespace-only spans", () => {
|
||||||
|
const input = "ab\n\n<<";
|
||||||
|
const chunks = markdownToTelegramChunks(input, 6);
|
||||||
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
||||||
|
expect(chunks.every((chunk) => chunk.html.length <= 6)).toBe(true);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("edge cases", () => {
|
describe("edge cases", () => {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue