From 3b95aa8804fb6dd99b0a2302a914ddf50271be0d Mon Sep 17 00:00:00 2001 From: AaronLuo00 Date: Sun, 8 Mar 2026 19:01:10 -0400 Subject: [PATCH] =?UTF-8?q?fix:=20address=20second-round=20review=20?= =?UTF-8?q?=E2=80=94=20Latin=20backward=20compat=20and=20emoji=20consisten?= =?UTF-8?q?cy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Two-pass line splitting: first slice at maxChars (unchanged for Latin), then re-split only CJK-heavy segments at chunking.tokens. This preserves the original ~800-char segments for ASCII lines while keeping CJK chunks within the token budget. - Narrow surrogate-pair adjustment to CJK Extension B+ range (D840–D87E) only, so emoji surrogate pairs are not affected. Mixed CJK+emoji text is now handled consistently regardless of composition. - Add tests: emoji handling (2), Latin backward-compat long-line (1). Addresses Codex P1 (oversized CJK segments) and P2s (Latin over-splitting, emoji surrogate inconsistency). --- packages/memory-host-sdk/src/host/internal.ts | 21 ++++++++++++------- src/utils/cjk-chars.ts | 6 +++--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/packages/memory-host-sdk/src/host/internal.ts b/packages/memory-host-sdk/src/host/internal.ts index 004d91d23e2..55c12593472 100644 --- a/packages/memory-host-sdk/src/host/internal.ts +++ b/packages/memory-host-sdk/src/host/internal.ts @@ -398,13 +398,20 @@ export function chunkMarkdown( if (line.length === 0) { segments.push(""); } else { - // Use token count (not maxChars) as the split step so that CJK lines - // – where 1 char ≈ 1 token – are sliced into budget-sized segments. - // For Latin text the token count is ≥ maxChars/4, which still produces - // segments well within the char budget after weighting. - const splitStep = Math.max(1, chunking.tokens); - for (let start = 0; start < line.length; start += splitStep) { - segments.push(line.slice(start, start + splitStep)); + // First pass: slice at maxChars (preserves original behaviour for Latin). + // Second pass: if a segment's *weighted* size still exceeds the budget + // (happens for CJK-heavy text where 1 char ≈ 1 token), re-split it at + // chunking.tokens so the chunk stays within the token budget. + for (let start = 0; start < line.length; start += maxChars) { + const coarse = line.slice(start, start + maxChars); + if (estimateStringChars(coarse) > maxChars) { + const fineStep = Math.max(1, chunking.tokens); + for (let j = 0; j < coarse.length; j += fineStep) { + segments.push(coarse.slice(j, j + fineStep)); + } + } else { + segments.push(coarse); + } } } for (const segment of segments) { diff --git a/src/utils/cjk-chars.ts b/src/utils/cjk-chars.ts index 483076749f7..54efd58fae4 100644 --- a/src/utils/cjk-chars.ts +++ b/src/utils/cjk-chars.ts @@ -46,9 +46,9 @@ export function estimateStringChars(text: string): number { /** * Matches surrogate pairs whose code point falls in the CJK Extension B+ - * range (U+20000–U+2FA1F). Only these surrogates need adjustment because + * range (U+20000–U+2FA1F). Only these surrogates need adjustment because * they are matched by {@link NON_LATIN_RE} and already counted in - * `nonLatinCount`. Other surrogates (emoji, symbols) are not matched by + * `nonLatinCount`. Other surrogates (emoji, symbols) are not matched by * that regex, so collapsing them would create an inconsistency. * * High-surrogate range for U+20000–U+2FA1F is D840–D87E. @@ -57,7 +57,7 @@ const CJK_SURROGATE_HIGH_RE = /[\uD840-\uD87E][\uDC00-\uDFFF]/g; /** * Return the code-point-aware length of the string, adjusting only for - * CJK Extension B+ surrogate pairs. For text without such characters + * CJK Extension B+ surrogate pairs. For text without such characters * (the vast majority of inputs) this returns `text.length` unchanged. */ function countCodePoints(text: string, nonLatinCount: number): number {