fix: preserve Telegram word boundaries when rechunking HTML (#47274)

* fix: preserve Telegram chunk word boundaries

* fix: address Telegram chunking review feedback

* fix: preserve Telegram retry separators

* fix: preserve Telegram chunking boundaries (#47274)
This commit is contained in:
Ayaan Zaidi 2026-03-15 18:10:49 +05:30 committed by GitHub
parent 26e0a3ee9a
commit c4265a5f16
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 242 additions and 6 deletions

View File

@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
- Email/webhook wrapping: sanitize sender and subject metadata before external-content wrapping so metadata fields cannot break the wrapper structure. Thanks @vincentkoc.
- Node/startup: remove leftover debug `console.log("node host PATH: ...")` that printed the resolved PATH on every `openclaw node run` invocation. (#46411)
- Telegram/message send: forward `--force-document` through the `sendPayload` path as well as `sendMedia`, so Telegram payload sends with `channelData` keep uploading images as documents instead of silently falling back to compressed photo sends. (#47119) Thanks @thepagent.
- Telegram/message chunking: preserve spaces, paragraph separators, and word boundaries when HTML overflow rechunking splits formatted replies. (#47274)
## 2026.3.13

View File

@ -512,6 +512,146 @@ function sliceLinkSpans(
});
}
function sliceMarkdownIR(ir: MarkdownIR, start: number, end: number): MarkdownIR {
return {
text: ir.text.slice(start, end),
styles: sliceStyleSpans(ir.styles, start, end),
links: sliceLinkSpans(ir.links, start, end),
};
}
function mergeAdjacentStyleSpans(styles: MarkdownIR["styles"]): MarkdownIR["styles"] {
const merged: MarkdownIR["styles"] = [];
for (const span of styles) {
const last = merged.at(-1);
if (last && last.style === span.style && span.start <= last.end) {
last.end = Math.max(last.end, span.end);
continue;
}
merged.push({ ...span });
}
return merged;
}
function mergeAdjacentLinkSpans(links: MarkdownIR["links"]): MarkdownIR["links"] {
const merged: MarkdownIR["links"] = [];
for (const link of links) {
const last = merged.at(-1);
if (last && last.href === link.href && link.start <= last.end) {
last.end = Math.max(last.end, link.end);
continue;
}
merged.push({ ...link });
}
return merged;
}
function mergeMarkdownIRChunks(left: MarkdownIR, right: MarkdownIR): MarkdownIR {
const offset = left.text.length;
return {
text: left.text + right.text,
styles: mergeAdjacentStyleSpans([
...left.styles,
...right.styles.map((span) => ({
...span,
start: span.start + offset,
end: span.end + offset,
})),
]),
links: mergeAdjacentLinkSpans([
...left.links,
...right.links.map((link) => ({
...link,
start: link.start + offset,
end: link.end + offset,
})),
]),
};
}
function renderTelegramChunkHtml(ir: MarkdownIR): string {
return wrapFileReferencesInHtml(renderTelegramHtml(ir));
}
function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: number): number {
const maxEnd = Math.min(text.length, start + limit);
if (maxEnd >= text.length) {
return text.length;
}
let lastOutsideParenNewlineBreak = -1;
let lastOutsideParenWhitespaceBreak = -1;
let lastOutsideParenWhitespaceRunStart = -1;
let lastAnyNewlineBreak = -1;
let lastAnyWhitespaceBreak = -1;
let lastAnyWhitespaceRunStart = -1;
let parenDepth = 0;
let sawNonWhitespace = false;
for (let index = start; index < maxEnd; index += 1) {
const char = text[index];
if (char === "(") {
sawNonWhitespace = true;
parenDepth += 1;
continue;
}
if (char === ")" && parenDepth > 0) {
sawNonWhitespace = true;
parenDepth -= 1;
continue;
}
if (!/\s/.test(char)) {
sawNonWhitespace = true;
continue;
}
if (!sawNonWhitespace) {
continue;
}
if (char === "\n") {
lastAnyNewlineBreak = index + 1;
if (parenDepth === 0) {
lastOutsideParenNewlineBreak = index + 1;
}
continue;
}
const whitespaceRunStart =
index === start || !/\s/.test(text[index - 1] ?? "") ? index : lastAnyWhitespaceRunStart;
lastAnyWhitespaceBreak = index + 1;
lastAnyWhitespaceRunStart = whitespaceRunStart;
if (parenDepth === 0) {
lastOutsideParenWhitespaceBreak = index + 1;
lastOutsideParenWhitespaceRunStart = whitespaceRunStart;
}
}
const resolveWhitespaceBreak = (breakIndex: number, runStart: number): number => {
if (breakIndex <= start) {
return breakIndex;
}
if (runStart <= start) {
return breakIndex;
}
return /\s/.test(text[breakIndex] ?? "") ? runStart : breakIndex;
};
if (lastOutsideParenNewlineBreak > start) {
return lastOutsideParenNewlineBreak;
}
if (lastOutsideParenWhitespaceBreak > start) {
return resolveWhitespaceBreak(
lastOutsideParenWhitespaceBreak,
lastOutsideParenWhitespaceRunStart,
);
}
if (lastAnyNewlineBreak > start) {
return lastAnyNewlineBreak;
}
if (lastAnyWhitespaceBreak > start) {
return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
}
return maxEnd;
}
function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
if (!ir.text) {
return [];
@ -523,7 +663,7 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
const chunks: MarkdownIR[] = [];
let cursor = 0;
while (cursor < ir.text.length) {
const end = Math.min(ir.text.length, cursor + normalizedLimit);
const end = findMarkdownIRPreservedSplitIndex(ir.text, cursor, normalizedLimit);
chunks.push({
text: ir.text.slice(cursor, end),
styles: sliceStyleSpans(ir.styles, cursor, end),
@ -534,32 +674,98 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
return chunks;
}
function coalesceWhitespaceOnlyMarkdownIRChunks(chunks: MarkdownIR[], limit: number): MarkdownIR[] {
const coalesced: MarkdownIR[] = [];
let index = 0;
while (index < chunks.length) {
const chunk = chunks[index];
if (!chunk) {
index += 1;
continue;
}
if (chunk.text.trim().length > 0) {
coalesced.push(chunk);
index += 1;
continue;
}
const prev = coalesced.at(-1);
const next = chunks[index + 1];
const chunkLength = chunk.text.length;
const canMergePrev = (candidate: MarkdownIR) =>
renderTelegramChunkHtml(candidate).length <= limit;
const canMergeNext = (candidate: MarkdownIR) =>
renderTelegramChunkHtml(candidate).length <= limit;
if (prev) {
const mergedPrev = mergeMarkdownIRChunks(prev, chunk);
if (canMergePrev(mergedPrev)) {
coalesced[coalesced.length - 1] = mergedPrev;
index += 1;
continue;
}
}
if (next) {
const mergedNext = mergeMarkdownIRChunks(chunk, next);
if (canMergeNext(mergedNext)) {
chunks[index + 1] = mergedNext;
index += 1;
continue;
}
}
if (prev && next) {
for (let prefixLength = chunkLength - 1; prefixLength >= 1; prefixLength -= 1) {
const prefix = sliceMarkdownIR(chunk, 0, prefixLength);
const suffix = sliceMarkdownIR(chunk, prefixLength, chunkLength);
const mergedPrev = mergeMarkdownIRChunks(prev, prefix);
const mergedNext = mergeMarkdownIRChunks(suffix, next);
if (canMergePrev(mergedPrev) && canMergeNext(mergedNext)) {
coalesced[coalesced.length - 1] = mergedPrev;
chunks[index + 1] = mergedNext;
break;
}
}
}
index += 1;
}
return coalesced;
}
function renderTelegramChunksWithinHtmlLimit(
ir: MarkdownIR,
limit: number,
): TelegramFormattedChunk[] {
const normalizedLimit = Math.max(1, Math.floor(limit));
const pending = chunkMarkdownIR(ir, normalizedLimit);
const rendered: TelegramFormattedChunk[] = [];
const finalized: MarkdownIR[] = [];
while (pending.length > 0) {
const chunk = pending.shift();
if (!chunk) {
continue;
}
const html = wrapFileReferencesInHtml(renderTelegramHtml(chunk));
const html = renderTelegramChunkHtml(chunk);
if (html.length <= normalizedLimit || chunk.text.length <= 1) {
rendered.push({ html, text: chunk.text });
finalized.push(chunk);
continue;
}
const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit, html.length);
if (split.length <= 1) {
// Worst-case safety: avoid retry loops, deliver the chunk as-is.
rendered.push({ html, text: chunk.text });
finalized.push(chunk);
continue;
}
pending.unshift(...split);
}
return rendered;
return coalesceWhitespaceOnlyMarkdownIRChunks(finalized, normalizedLimit).map((chunk) => ({
html: renderTelegramChunkHtml(chunk),
text: chunk.text,
}));
}
export function markdownToTelegramChunks(

View File

@ -174,6 +174,35 @@ describe("markdownToTelegramChunks - file reference wrapping", () => {
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true);
});
it("prefers word boundaries when html-limit retry splits formatted prose", () => {
const input = "**Which of these**";
const chunks = markdownToTelegramChunks(input, 16);
expect(chunks.map((chunk) => chunk.text)).toEqual(["Which of ", "these"]);
expect(chunks.every((chunk) => chunk.html.length <= 16)).toBe(true);
});
it("falls back to in-paren word boundaries when the parenthesis is unbalanced", () => {
const input = "**foo (bar baz qux quux**";
const chunks = markdownToTelegramChunks(input, 20);
expect(chunks.map((chunk) => chunk.text)).toEqual(["foo", "(bar baz qux ", "quux"]);
expect(chunks.every((chunk) => chunk.html.length <= 20)).toBe(true);
});
it("does not emit whitespace-only chunks during html-limit retry splitting", () => {
const input = "**ab <<**";
const chunks = markdownToTelegramChunks(input, 11);
expect(chunks.map((chunk) => chunk.text).join("")).toBe("ab <<");
expect(chunks.every((chunk) => chunk.text.trim().length > 0)).toBe(true);
expect(chunks.every((chunk) => chunk.html.length <= 11)).toBe(true);
});
it("preserves paragraph separators when retry chunking produces whitespace-only spans", () => {
const input = "ab\n\n<<";
const chunks = markdownToTelegramChunks(input, 6);
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
expect(chunks.every((chunk) => chunk.html.length <= 6)).toBe(true);
});
});
describe("edge cases", () => {