mirror of https://github.com/openclaw/openclaw.git
fix: guard fine-split against breaking UTF-16 surrogate pairs
When re-splitting CJK-heavy segments at chunking.tokens, check whether the slice boundary falls on a high surrogate (0xD800–0xDBFF) and if so extend by one code unit to keep the pair intact. Prevents producing broken surrogate halves for CJK Extension B+ characters (U+20000+). Add test verifying no lone surrogates appear when splitting lines of surrogate-pair characters with an odd token budget. Addresses third-round Codex P2 review comment.
This commit is contained in:
parent
3b95aa8804
commit
f8547fcae4
|
|
@ -329,18 +329,22 @@ describe("chunkMarkdown", () => {
|
|||
expect(cjkCount).toBeLessThanOrEqual(200 * 2);
|
||||
}
|
||||
});
|
||||
|
||||
it("does not break surrogate pairs when splitting long CJK lines", () => {
|
||||
// "𠀀" (U+20000) is a surrogate pair: 2 UTF-16 code units per character.
|
||||
// A line of 500 such characters = 1000 UTF-16 code units.
|
||||
// With tokens=99 (odd), the fine-split must not cut inside a pair.
|
||||
const surrogateChar = "\u{20000}";
|
||||
const surrogateChar = "\u{20000}"; // 𠀀
|
||||
const longLine = surrogateChar.repeat(500);
|
||||
const chunks = chunkMarkdown(longLine, { tokens: 99, overlap: 0 });
|
||||
for (const chunk of chunks) {
|
||||
// No chunk should contain the Unicode replacement character U+FFFD,
|
||||
// which would indicate a broken surrogate pair.
|
||||
expect(chunk.text).not.toContain("\uFFFD");
|
||||
// The chunk must be well-formed UTF-16: every high surrogate is followed by a low surrogate.
|
||||
for (let i = 0; i < chunk.text.length; i += 1) {
|
||||
const code = chunk.text.charCodeAt(i);
|
||||
if (code >= 0xd800 && code <= 0xdbff) {
|
||||
// High surrogate must be followed by a low surrogate
|
||||
const next = chunk.text.charCodeAt(i + 1);
|
||||
expect(next).toBeGreaterThanOrEqual(0xdc00);
|
||||
expect(next).toBeLessThanOrEqual(0xdfff);
|
||||
|
|
@ -348,7 +352,6 @@ describe("chunkMarkdown", () => {
|
|||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("does not over-split long Latin lines (backward compat)", () => {
|
||||
// 2000 ASCII chars / 800 maxChars -> about 3 segments, not 10 tiny ones.
|
||||
const longLatinLine = "a".repeat(2000);
|
||||
|
|
|
|||
|
|
@ -406,8 +406,17 @@ export function chunkMarkdown(
|
|||
const coarse = line.slice(start, start + maxChars);
|
||||
if (estimateStringChars(coarse) > maxChars) {
|
||||
const fineStep = Math.max(1, chunking.tokens);
|
||||
for (let j = 0; j < coarse.length; j += fineStep) {
|
||||
segments.push(coarse.slice(j, j + fineStep));
|
||||
for (let j = 0; j < coarse.length; ) {
|
||||
let end = Math.min(j + fineStep, coarse.length);
|
||||
// Avoid splitting inside a UTF-16 surrogate pair (CJK Extension B+).
|
||||
if (end < coarse.length) {
|
||||
const code = coarse.charCodeAt(end - 1);
|
||||
if (code >= 0xd800 && code <= 0xdbff) {
|
||||
end += 1; // include the low surrogate
|
||||
}
|
||||
}
|
||||
segments.push(coarse.slice(j, end));
|
||||
j = end; // advance cursor to the adjusted boundary
|
||||
}
|
||||
} else {
|
||||
segments.push(coarse);
|
||||
|
|
|
|||
Loading…
Reference in New Issue