mirror of https://github.com/openclaw/openclaw.git
fix: preserve Telegram word boundaries when rechunking HTML (#47274)
* fix: preserve Telegram chunk word boundaries * fix: address Telegram chunking review feedback * fix: preserve Telegram retry separators * fix: preserve Telegram chunking boundaries (#47274)
This commit is contained in:
parent
26e0a3ee9a
commit
c4265a5f16
|
|
@ -44,6 +44,7 @@ Docs: https://docs.openclaw.ai
|
||||||
- Email/webhook wrapping: sanitize sender and subject metadata before external-content wrapping so metadata fields cannot break the wrapper structure. Thanks @vincentkoc.
|
- Email/webhook wrapping: sanitize sender and subject metadata before external-content wrapping so metadata fields cannot break the wrapper structure. Thanks @vincentkoc.
|
||||||
- Node/startup: remove leftover debug `console.log("node host PATH: ...")` that printed the resolved PATH on every `openclaw node run` invocation. (#46411)
|
- Node/startup: remove leftover debug `console.log("node host PATH: ...")` that printed the resolved PATH on every `openclaw node run` invocation. (#46411)
|
||||||
- Telegram/message send: forward `--force-document` through the `sendPayload` path as well as `sendMedia`, so Telegram payload sends with `channelData` keep uploading images as documents instead of silently falling back to compressed photo sends. (#47119) Thanks @thepagent.
|
- Telegram/message send: forward `--force-document` through the `sendPayload` path as well as `sendMedia`, so Telegram payload sends with `channelData` keep uploading images as documents instead of silently falling back to compressed photo sends. (#47119) Thanks @thepagent.
|
||||||
|
- Telegram/message chunking: preserve spaces, paragraph separators, and word boundaries when HTML overflow rechunking splits formatted replies. (#47274)
|
||||||
|
|
||||||
## 2026.3.13
|
## 2026.3.13
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -512,6 +512,146 @@ function sliceLinkSpans(
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function sliceMarkdownIR(ir: MarkdownIR, start: number, end: number): MarkdownIR {
|
||||||
|
return {
|
||||||
|
text: ir.text.slice(start, end),
|
||||||
|
styles: sliceStyleSpans(ir.styles, start, end),
|
||||||
|
links: sliceLinkSpans(ir.links, start, end),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function mergeAdjacentStyleSpans(styles: MarkdownIR["styles"]): MarkdownIR["styles"] {
|
||||||
|
const merged: MarkdownIR["styles"] = [];
|
||||||
|
for (const span of styles) {
|
||||||
|
const last = merged.at(-1);
|
||||||
|
if (last && last.style === span.style && span.start <= last.end) {
|
||||||
|
last.end = Math.max(last.end, span.end);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
merged.push({ ...span });
|
||||||
|
}
|
||||||
|
return merged;
|
||||||
|
}
|
||||||
|
|
||||||
|
function mergeAdjacentLinkSpans(links: MarkdownIR["links"]): MarkdownIR["links"] {
|
||||||
|
const merged: MarkdownIR["links"] = [];
|
||||||
|
for (const link of links) {
|
||||||
|
const last = merged.at(-1);
|
||||||
|
if (last && last.href === link.href && link.start <= last.end) {
|
||||||
|
last.end = Math.max(last.end, link.end);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
merged.push({ ...link });
|
||||||
|
}
|
||||||
|
return merged;
|
||||||
|
}
|
||||||
|
|
||||||
|
function mergeMarkdownIRChunks(left: MarkdownIR, right: MarkdownIR): MarkdownIR {
|
||||||
|
const offset = left.text.length;
|
||||||
|
return {
|
||||||
|
text: left.text + right.text,
|
||||||
|
styles: mergeAdjacentStyleSpans([
|
||||||
|
...left.styles,
|
||||||
|
...right.styles.map((span) => ({
|
||||||
|
...span,
|
||||||
|
start: span.start + offset,
|
||||||
|
end: span.end + offset,
|
||||||
|
})),
|
||||||
|
]),
|
||||||
|
links: mergeAdjacentLinkSpans([
|
||||||
|
...left.links,
|
||||||
|
...right.links.map((link) => ({
|
||||||
|
...link,
|
||||||
|
start: link.start + offset,
|
||||||
|
end: link.end + offset,
|
||||||
|
})),
|
||||||
|
]),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function renderTelegramChunkHtml(ir: MarkdownIR): string {
|
||||||
|
return wrapFileReferencesInHtml(renderTelegramHtml(ir));
|
||||||
|
}
|
||||||
|
|
||||||
|
function findMarkdownIRPreservedSplitIndex(text: string, start: number, limit: number): number {
|
||||||
|
const maxEnd = Math.min(text.length, start + limit);
|
||||||
|
if (maxEnd >= text.length) {
|
||||||
|
return text.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
let lastOutsideParenNewlineBreak = -1;
|
||||||
|
let lastOutsideParenWhitespaceBreak = -1;
|
||||||
|
let lastOutsideParenWhitespaceRunStart = -1;
|
||||||
|
let lastAnyNewlineBreak = -1;
|
||||||
|
let lastAnyWhitespaceBreak = -1;
|
||||||
|
let lastAnyWhitespaceRunStart = -1;
|
||||||
|
let parenDepth = 0;
|
||||||
|
let sawNonWhitespace = false;
|
||||||
|
|
||||||
|
for (let index = start; index < maxEnd; index += 1) {
|
||||||
|
const char = text[index];
|
||||||
|
if (char === "(") {
|
||||||
|
sawNonWhitespace = true;
|
||||||
|
parenDepth += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (char === ")" && parenDepth > 0) {
|
||||||
|
sawNonWhitespace = true;
|
||||||
|
parenDepth -= 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!/\s/.test(char)) {
|
||||||
|
sawNonWhitespace = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!sawNonWhitespace) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (char === "\n") {
|
||||||
|
lastAnyNewlineBreak = index + 1;
|
||||||
|
if (parenDepth === 0) {
|
||||||
|
lastOutsideParenNewlineBreak = index + 1;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const whitespaceRunStart =
|
||||||
|
index === start || !/\s/.test(text[index - 1] ?? "") ? index : lastAnyWhitespaceRunStart;
|
||||||
|
lastAnyWhitespaceBreak = index + 1;
|
||||||
|
lastAnyWhitespaceRunStart = whitespaceRunStart;
|
||||||
|
if (parenDepth === 0) {
|
||||||
|
lastOutsideParenWhitespaceBreak = index + 1;
|
||||||
|
lastOutsideParenWhitespaceRunStart = whitespaceRunStart;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const resolveWhitespaceBreak = (breakIndex: number, runStart: number): number => {
|
||||||
|
if (breakIndex <= start) {
|
||||||
|
return breakIndex;
|
||||||
|
}
|
||||||
|
if (runStart <= start) {
|
||||||
|
return breakIndex;
|
||||||
|
}
|
||||||
|
return /\s/.test(text[breakIndex] ?? "") ? runStart : breakIndex;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (lastOutsideParenNewlineBreak > start) {
|
||||||
|
return lastOutsideParenNewlineBreak;
|
||||||
|
}
|
||||||
|
if (lastOutsideParenWhitespaceBreak > start) {
|
||||||
|
return resolveWhitespaceBreak(
|
||||||
|
lastOutsideParenWhitespaceBreak,
|
||||||
|
lastOutsideParenWhitespaceRunStart,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (lastAnyNewlineBreak > start) {
|
||||||
|
return lastAnyNewlineBreak;
|
||||||
|
}
|
||||||
|
if (lastAnyWhitespaceBreak > start) {
|
||||||
|
return resolveWhitespaceBreak(lastAnyWhitespaceBreak, lastAnyWhitespaceRunStart);
|
||||||
|
}
|
||||||
|
return maxEnd;
|
||||||
|
}
|
||||||
|
|
||||||
function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
|
function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): MarkdownIR[] {
|
||||||
if (!ir.text) {
|
if (!ir.text) {
|
||||||
return [];
|
return [];
|
||||||
|
|
@ -523,7 +663,7 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
|
||||||
const chunks: MarkdownIR[] = [];
|
const chunks: MarkdownIR[] = [];
|
||||||
let cursor = 0;
|
let cursor = 0;
|
||||||
while (cursor < ir.text.length) {
|
while (cursor < ir.text.length) {
|
||||||
const end = Math.min(ir.text.length, cursor + normalizedLimit);
|
const end = findMarkdownIRPreservedSplitIndex(ir.text, cursor, normalizedLimit);
|
||||||
chunks.push({
|
chunks.push({
|
||||||
text: ir.text.slice(cursor, end),
|
text: ir.text.slice(cursor, end),
|
||||||
styles: sliceStyleSpans(ir.styles, cursor, end),
|
styles: sliceStyleSpans(ir.styles, cursor, end),
|
||||||
|
|
@ -534,32 +674,98 @@ function splitMarkdownIRPreserveWhitespace(ir: MarkdownIR, limit: number): Markd
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function coalesceWhitespaceOnlyMarkdownIRChunks(chunks: MarkdownIR[], limit: number): MarkdownIR[] {
|
||||||
|
const coalesced: MarkdownIR[] = [];
|
||||||
|
let index = 0;
|
||||||
|
|
||||||
|
while (index < chunks.length) {
|
||||||
|
const chunk = chunks[index];
|
||||||
|
if (!chunk) {
|
||||||
|
index += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (chunk.text.trim().length > 0) {
|
||||||
|
coalesced.push(chunk);
|
||||||
|
index += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const prev = coalesced.at(-1);
|
||||||
|
const next = chunks[index + 1];
|
||||||
|
const chunkLength = chunk.text.length;
|
||||||
|
|
||||||
|
const canMergePrev = (candidate: MarkdownIR) =>
|
||||||
|
renderTelegramChunkHtml(candidate).length <= limit;
|
||||||
|
const canMergeNext = (candidate: MarkdownIR) =>
|
||||||
|
renderTelegramChunkHtml(candidate).length <= limit;
|
||||||
|
|
||||||
|
if (prev) {
|
||||||
|
const mergedPrev = mergeMarkdownIRChunks(prev, chunk);
|
||||||
|
if (canMergePrev(mergedPrev)) {
|
||||||
|
coalesced[coalesced.length - 1] = mergedPrev;
|
||||||
|
index += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (next) {
|
||||||
|
const mergedNext = mergeMarkdownIRChunks(chunk, next);
|
||||||
|
if (canMergeNext(mergedNext)) {
|
||||||
|
chunks[index + 1] = mergedNext;
|
||||||
|
index += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (prev && next) {
|
||||||
|
for (let prefixLength = chunkLength - 1; prefixLength >= 1; prefixLength -= 1) {
|
||||||
|
const prefix = sliceMarkdownIR(chunk, 0, prefixLength);
|
||||||
|
const suffix = sliceMarkdownIR(chunk, prefixLength, chunkLength);
|
||||||
|
const mergedPrev = mergeMarkdownIRChunks(prev, prefix);
|
||||||
|
const mergedNext = mergeMarkdownIRChunks(suffix, next);
|
||||||
|
if (canMergePrev(mergedPrev) && canMergeNext(mergedNext)) {
|
||||||
|
coalesced[coalesced.length - 1] = mergedPrev;
|
||||||
|
chunks[index + 1] = mergedNext;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
index += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return coalesced;
|
||||||
|
}
|
||||||
|
|
||||||
/**
 * Splits a Markdown IR document into Telegram-ready chunks whose rendered
 * HTML each fits within `limit` characters, then renders them.
 *
 * Works as a retry queue: initial chunks come from `chunkMarkdownIR`; any
 * chunk whose HTML overflows the limit is re-split by
 * `splitTelegramChunkByHtmlLimit` and its pieces pushed back to the front of
 * the queue for re-checking. Whitespace-only fragments left over from retry
 * splitting are coalesced back into neighbors before the final render.
 */
function renderTelegramChunksWithinHtmlLimit(
  ir: MarkdownIR,
  limit: number,
): TelegramFormattedChunk[] {
  // Guard against zero, negative, or fractional limits.
  const normalizedLimit = Math.max(1, Math.floor(limit));
  const pending = chunkMarkdownIR(ir, normalizedLimit);
  const finalized: MarkdownIR[] = [];
  while (pending.length > 0) {
    const chunk = pending.shift();
    if (!chunk) {
      continue;
    }
    const html = renderTelegramChunkHtml(chunk);
    // Accept when the HTML fits, or when the chunk cannot be split further
    // (single character) and must be delivered regardless.
    if (html.length <= normalizedLimit || chunk.text.length <= 1) {
      finalized.push(chunk);
      continue;
    }
    const split = splitTelegramChunkByHtmlLimit(chunk, normalizedLimit, html.length);
    if (split.length <= 1) {
      // Worst-case safety: avoid retry loops, deliver the chunk as-is.
      finalized.push(chunk);
      continue;
    }
    // Re-queue the split pieces in order; each may still need splitting.
    pending.unshift(...split);
  }
  // Merge whitespace-only retry fragments into neighbors, then render each
  // surviving chunk to its final HTML/text pair.
  return coalesceWhitespaceOnlyMarkdownIRChunks(finalized, normalizedLimit).map((chunk) => ({
    html: renderTelegramChunkHtml(chunk),
    text: chunk.text,
  }));
}
||||||
export function markdownToTelegramChunks(
|
export function markdownToTelegramChunks(
|
||||||
|
|
|
||||||
|
|
@ -174,6 +174,35 @@ describe("markdownToTelegramChunks - file reference wrapping", () => {
|
||||||
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
||||||
expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true);
|
expect(chunks.every((chunk) => chunk.html.length <= 5)).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("prefers word boundaries when html-limit retry splits formatted prose", () => {
|
||||||
|
const input = "**Which of these**";
|
||||||
|
const chunks = markdownToTelegramChunks(input, 16);
|
||||||
|
expect(chunks.map((chunk) => chunk.text)).toEqual(["Which of ", "these"]);
|
||||||
|
expect(chunks.every((chunk) => chunk.html.length <= 16)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("falls back to in-paren word boundaries when the parenthesis is unbalanced", () => {
|
||||||
|
const input = "**foo (bar baz qux quux**";
|
||||||
|
const chunks = markdownToTelegramChunks(input, 20);
|
||||||
|
expect(chunks.map((chunk) => chunk.text)).toEqual(["foo", "(bar baz qux ", "quux"]);
|
||||||
|
expect(chunks.every((chunk) => chunk.html.length <= 20)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("does not emit whitespace-only chunks during html-limit retry splitting", () => {
|
||||||
|
const input = "**ab <<**";
|
||||||
|
const chunks = markdownToTelegramChunks(input, 11);
|
||||||
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe("ab <<");
|
||||||
|
expect(chunks.every((chunk) => chunk.text.trim().length > 0)).toBe(true);
|
||||||
|
expect(chunks.every((chunk) => chunk.html.length <= 11)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("preserves paragraph separators when retry chunking produces whitespace-only spans", () => {
|
||||||
|
const input = "ab\n\n<<";
|
||||||
|
const chunks = markdownToTelegramChunks(input, 6);
|
||||||
|
expect(chunks.map((chunk) => chunk.text).join("")).toBe(input);
|
||||||
|
expect(chunks.every((chunk) => chunk.html.length <= 6)).toBe(true);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("edge cases", () => {
|
describe("edge cases", () => {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue