fix: guard fine-split against breaking UTF-16 surrogate pairs

When re-splitting CJK-heavy segments in steps of chunking.tokens UTF-16 code units, check whether the last code unit of a slice is a high surrogate (0xD800–0xDBFF) and, if so, extend the slice by one code unit so the pair stays intact. This prevents emitting lone surrogate halves for characters outside the Basic Multilingual Plane, such as CJK Extension B and later (U+20000 and above). Add a test verifying that no lone surrogates appear when splitting lines of surrogate-pair characters with an odd token budget. Addresses the third-round Codex P2 review comment.
2026-03-08 19:16:09 -04:00 · 2026-03-08 19:16:09 -04:00 · f8547fcae4
parent 3b95aa8804
commit f8547fcae4
2 changed files with 17 additions and 5 deletions
--- a/packages/memory-host-sdk/src/host/internal.test.ts
+++ b/packages/memory-host-sdk/src/host/internal.test.ts
@@ -329,18 +329,22 @@ describe("chunkMarkdown", () => {
      expect(cjkCount).toBeLessThanOrEqual(200 * 2);
    }
  });
-
  it("does not break surrogate pairs when splitting long CJK lines", () => {
    // "𠀀" (U+20000) is a surrogate pair: 2 UTF-16 code units per character.
+    // A line of 500 such characters = 1000 UTF-16 code units.
    // With tokens=99 (odd), the fine-split must not cut inside a pair.
-    const surrogateChar = "\u{20000}";
+    const surrogateChar = "\u{20000}"; // 𠀀
    const longLine = surrogateChar.repeat(500);
    const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 });
    for (const chunk of chunks) {
+      // No chunk should contain the Unicode replacement character U+FFFD,
+      // which would indicate a broken surrogate pair.
      expect(chunk.text).not.toContain("\uFFFD");
+      // Every character in the chunk should be a valid string (no lone surrogates).
      for (let i = 0; i < chunk.text.length; i += 1) {
        const code = chunk.text.charCodeAt(i);
        if (code >= 0xd800 && code <= 0xdbff) {
+          // High surrogate must be followed by a low surrogate
          const next = chunk.text.charCodeAt(i + 1);
          expect(next).toBeGreaterThanOrEqual(0xdc00);
          expect(next).toBeLessThanOrEqual(0xdfff);
@@ -348,7 +352,6 @@ describe("chunkMarkdown", () => {
      }
    }
  });
-
  it("does not over-split long Latin lines (backward compat)", () => {
    // 2000 ASCII chars / 800 maxChars -> about 3 segments, not 10 tiny ones.
    const longLatinLine = "a".repeat(2000);
--- a/packages/memory-host-sdk/src/host/internal.ts
+++ b/packages/memory-host-sdk/src/host/internal.ts
@@ -406,8 +406,17 @@ export function chunkMarkdown(
        const coarse = line.slice(start, start + maxChars);
        if (estimateStringChars(coarse) > maxChars) {
          const fineStep = Math.max(1, chunking.tokens);
-          for (let j = 0; j < coarse.length; j += fineStep) {
-            segments.push(coarse.slice(j, j + fineStep));
+          for (let j = 0; j < coarse.length; ) {
+            let end = Math.min(j + fineStep, coarse.length);
+            // Avoid splitting inside a UTF-16 surrogate pair (CJK Extension B+).
+            if (end < coarse.length) {
+              const code = coarse.charCodeAt(end - 1);
+              if (code >= 0xd800 && code <= 0xdbff) {
+                end += 1; // include the low surrogate
+              }
+            }
+            segments.push(coarse.slice(j, end));
+            j = end; // advance cursor to the adjusted boundary
          }
        } else {
          segments.push(coarse);