diff --git a/extensions/memory-core/src/memory/mmr.test.ts b/extensions/memory-core/src/memory/mmr.test.ts index 621d1e509c8..d8a268ba7e9 100644 --- a/extensions/memory-core/src/memory/mmr.test.ts +++ b/extensions/memory-core/src/memory/mmr.test.ts @@ -30,6 +30,37 @@ describe("tokenize", () => { input: "hello hello world world", expected: ["hello", "world"], }, + { + name: "CJK characters produce unigrams and bigrams", + input: "今天讨论", + expected: ["今", "天", "讨", "论", "今天", "天讨", "讨论"], + }, + { + name: "mixed ASCII and CJK", + input: "hello 你好世界 test", + expected: ["hello", "test", "你", "好", "世", "界", "你好", "好世", "世界"], + }, + { + name: "single CJK character (no bigrams)", + input: "龙", + expected: ["龙"], + }, + { + name: "non-adjacent CJK chars do not form bigrams", + input: "我a好", + expected: ["a", "我", "好"], + // No "我好" bigram — they are separated by "a" + }, + { + name: "Japanese hiragana", + input: "こんにちは", + expected: ["こ", "ん", "に", "ち", "は", "こん", "んに", "にち", "ちは"], + }, + { + name: "Korean hangul", + input: "안녕하세요", + expected: ["안", "녕", "하", "세", "요", "안녕", "녕하", "하세", "세요"], + }, ] as const; for (const testCase of cases) { @@ -90,10 +121,33 @@ describe("textSimilarity", () => { { name: "same words reordered", left: "hello world", right: "world hello", expected: 1 }, { name: "different text", left: "hello world", right: "foo bar", expected: 0 }, { name: "case insensitive", left: "Hello World", right: "hello world", expected: 1 }, + { + name: "CJK similar texts share tokens", + left: "今天我们讨论了项目进展", + right: "今天我们讨论了会议安排", + // Shared unigrams: 今,天,我,们,讨,论,了 (7) + shared bigrams: 今天,天我,我们,们讨,讨论,论了 (6) = 13 shared + // Total unique tokens > 13, so similarity > 0 and < 1 + expected: -1, // placeholder — just check > 0 + }, + { + name: "CJK completely different texts", + left: "苹果香蕉", + right: "钢铁煤炭", + expected: 0, + }, ] as const; for (const testCase of cases) { - expect(textSimilarity(testCase.left, testCase.right), 
/**
 * Regex matching a single CJK-family character, i.e. a character from a
 * script that is written without whitespace word boundaries:
 * - Hiragana & Katakana (Japanese)
 * - CJK Extension A
 * - CJK Unified Ideographs (Chinese hanzi, Japanese kanji, Korean hanja)
 * - Hangul Syllables & Jamo (Korean)
 *
 * Deliberately has no `g` flag, so `test()` is stateless and safe to call
 * repeatedly in a loop.
 */
const CJK_RE = /[\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\u1100-\u11ff]/;

/**
 * Tokenize text for Jaccard similarity computation.
 *
 * The input is lowercased, then three kinds of tokens are collected:
 * - alphanumeric/underscore runs (`[a-z0-9_]+`),
 * - every CJK-family character on its own (unigrams),
 * - every pair of CJK-family characters that are directly adjacent in the
 *   text (bigrams).
 *
 * Bigrams are only created from characters that are adjacent in the
 * original text, so mixed content like "我喜欢hello你好" will NOT
 * produce the spurious bigram "欢你".
 *
 * @param text - Raw text to tokenize.
 * @returns The set of distinct tokens; empty when the text contains no
 *   alphanumeric or CJK-family characters.
 */
export function tokenize(text: string): Set<string> {
  const lower = text.toLowerCase();
  const ascii = lower.match(/[a-z0-9_]+/g) ?? [];

  const unigrams: string[] = [];
  const bigrams: string[] = [];

  // The CJK character seen at the immediately preceding code point, or
  // undefined when the previous code point was not CJK (or at the start).
  let previousCjk: string | undefined;

  // Iterating the string walks code points, so surrogate pairs count as a
  // single position when deciding adjacency.
  for (const char of lower) {
    if (!CJK_RE.test(char)) {
      // Any non-CJK character breaks the run, preventing a bigram across it.
      previousCjk = undefined;
      continue;
    }
    unigrams.push(char);
    if (previousCjk !== undefined) {
      bigrams.push(previousCjk + char);
    }
    previousCjk = char;
  }

  return new Set<string>([...ascii, ...bigrams, ...unigrams]);
}