diff --git a/packages/memory-host-sdk/src/host/internal.ts b/packages/memory-host-sdk/src/host/internal.ts
index 89fd0030a94..004d91d23e2 100644
--- a/packages/memory-host-sdk/src/host/internal.ts
+++ b/packages/memory-host-sdk/src/host/internal.ts
@@ -398,8 +398,13 @@ export function chunkMarkdown(
     if (line.length === 0) {
       segments.push("");
     } else {
-      for (let start = 0; start < line.length; start += maxChars) {
-        segments.push(line.slice(start, start + maxChars));
+      // Use token count (not maxChars) as the split step so that CJK lines
+      // – where 1 char ≈ 1 token – are sliced into budget-sized segments.
+      // For Latin text the token count is ≥ maxChars/4, which still produces
+      // segments well within the char budget after weighting.
+      const splitStep = Math.max(1, chunking.tokens);
+      for (let start = 0; start < line.length; start += splitStep) {
+        segments.push(line.slice(start, start + splitStep));
       }
     }
     for (const segment of segments) {
diff --git a/src/utils/cjk-chars.test.ts b/src/utils/cjk-chars.test.ts
index 0e8327d0a7d..50fe188dd1c 100644
--- a/src/utils/cjk-chars.test.ts
+++ b/src/utils/cjk-chars.test.ts
@@ -81,7 +81,6 @@ describe("estimateStringChars", () => {
     // "你" counts as 4, emoji remains 2 => total 6
     expect(estimateStringChars("你😀")).toBe(6);
   });
-
   it("yields ~1 token per CJK char when divided by CHARS_PER_TOKEN_ESTIMATE", () => {
     // 10 CJK chars should estimate as ~10 tokens
     const cjk = "这是一个测试用的句子呢";