fix: guard fine-split against breaking UTF-16 surrogate pairs

When re-splitting CJK-heavy segments at chunking.tokens, check whether the
slice would end on a high surrogate (0xD800–0xDBFF) and, if so, extend it
by one code unit to keep the pair intact. This prevents producing broken
surrogate halves for CJK Extension B+ characters (U+20000+).

Add test verifying no lone surrogates appear when splitting lines of
surrogate-pair characters with an odd token budget.

Addresses third-round Codex P2 review comment.
This commit is contained in:
AaronLuo00 2026-03-08 19:16:09 -04:00 committed by Peter Steinberger
parent 3b95aa8804
commit f8547fcae4
2 changed files with 17 additions and 5 deletions

View File

@ -329,18 +329,22 @@ describe("chunkMarkdown", () => {
expect(cjkCount).toBeLessThanOrEqual(200 * 2);
}
});
it("does not break surrogate pairs when splitting long CJK lines", () => {
// "𠀀" (U+20000) is a surrogate pair: 2 UTF-16 code units per character.
// A line of 500 such characters = 1000 UTF-16 code units.
// With tokens=99 (odd), the fine-split must not cut inside a pair.
const surrogateChar = "\u{20000}";
const surrogateChar = "\u{20000}"; // 𠀀
const longLine = surrogateChar.repeat(500);
const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 });
for (const chunk of chunks) {
// No chunk should contain the Unicode replacement character U+FFFD,
// which would indicate a broken surrogate pair.
expect(chunk.text).not.toContain("\uFFFD");
// Every character in the chunk should be a valid string (no lone surrogates).
for (let i = 0; i < chunk.text.length; i += 1) {
const code = chunk.text.charCodeAt(i);
if (code >= 0xd800 && code <= 0xdbff) {
// High surrogate must be followed by a low surrogate
const next = chunk.text.charCodeAt(i + 1);
expect(next).toBeGreaterThanOrEqual(0xdc00);
expect(next).toBeLessThanOrEqual(0xdfff);
@ -348,7 +352,6 @@ describe("chunkMarkdown", () => {
}
}
});
it("does not over-split long Latin lines (backward compat)", () => {
// 2000 ASCII chars / 800 maxChars -> about 3 segments, not 10 tiny ones.
const longLatinLine = "a".repeat(2000);

View File

@ -406,8 +406,17 @@ export function chunkMarkdown(
const coarse = line.slice(start, start + maxChars);
if (estimateStringChars(coarse) > maxChars) {
const fineStep = Math.max(1, chunking.tokens);
for (let j = 0; j < coarse.length; j += fineStep) {
segments.push(coarse.slice(j, j + fineStep));
for (let j = 0; j < coarse.length; ) {
let end = Math.min(j + fineStep, coarse.length);
// Avoid splitting inside a UTF-16 surrogate pair (CJK Extension B+).
if (end < coarse.length) {
const code = coarse.charCodeAt(end - 1);
if (code >= 0xd800 && code <= 0xdbff) {
end += 1; // include the low surrogate
}
}
segments.push(coarse.slice(j, end));
j = end; // advance cursor to the adjusted boundary
}
} else {
segments.push(coarse);