diff --git a/packages/memory-host-sdk/src/host/internal.test.ts b/packages/memory-host-sdk/src/host/internal.test.ts index 764fbc24bf8..8e7a748d740 100644 --- a/packages/memory-host-sdk/src/host/internal.test.ts +++ b/packages/memory-host-sdk/src/host/internal.test.ts @@ -329,18 +329,22 @@ describe("chunkMarkdown", () => { expect(cjkCount).toBeLessThanOrEqual(200 * 2); } }); - it("does not break surrogate pairs when splitting long CJK lines", () => { // "𠀀" (U+20000) is a surrogate pair: 2 UTF-16 code units per character. + // A line of 500 such characters = 1000 UTF-16 code units. // With tokens=99 (odd), the fine-split must not cut inside a pair. - const surrogateChar = "\u{20000}"; + const surrogateChar = "\u{20000}"; // 𠀀 const longLine = surrogateChar.repeat(500); const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 }); for (const chunk of chunks) { + // No chunk should contain the Unicode replacement character U+FFFD, + // which would indicate a broken surrogate pair. expect(chunk.text).not.toContain("\uFFFD"); + // The chunk text must be well-formed UTF-16: no high surrogate may be left without its low surrogate. for (let i = 0; i < chunk.text.length; i += 1) { const code = chunk.text.charCodeAt(i); if (code >= 0xd800 && code <= 0xdbff) { + // A high surrogate must be followed by a low surrogate within the same chunk. const next = chunk.text.charCodeAt(i + 1); expect(next).toBeGreaterThanOrEqual(0xdc00); expect(next).toBeLessThanOrEqual(0xdfff); @@ -348,7 +352,6 @@ describe("chunkMarkdown", () => { } } }); - it("does not over-split long Latin lines (backward compat)", () => { // 2000 ASCII chars / 800 maxChars -> about 3 segments, not 10 tiny ones. 
const longLatinLine = "a".repeat(2000); diff --git a/packages/memory-host-sdk/src/host/internal.ts b/packages/memory-host-sdk/src/host/internal.ts index 55c12593472..45d4ceacb65 100644 --- a/packages/memory-host-sdk/src/host/internal.ts +++ b/packages/memory-host-sdk/src/host/internal.ts @@ -406,8 +406,17 @@ export function chunkMarkdown( const coarse = line.slice(start, start + maxChars); if (estimateStringChars(coarse) > maxChars) { const fineStep = Math.max(1, chunking.tokens); - for (let j = 0; j < coarse.length; j += fineStep) { - segments.push(coarse.slice(j, j + fineStep)); + for (let j = 0; j < coarse.length; ) { + let end = Math.min(j + fineStep, coarse.length); + // Avoid splitting inside a UTF-16 surrogate pair (any code point above U+FFFF, e.g. CJK Extension B+ or emoji). + if (end < coarse.length) { + const code = coarse.charCodeAt(end - 1); + if (code >= 0xd800 && code <= 0xdbff) { + end += 1; // include the low surrogate + } + } + segments.push(coarse.slice(j, end)); + j = end; // advance cursor to the adjusted boundary } } else { segments.push(coarse);