fix(memory): add CJK/Kana/Hangul support to MMR tokenize() for diversity detection

The tokenize() function only matched [a-z0-9_]+ patterns, returning an
empty set for CJK-only text. This made Jaccard similarity always 0 (or
always 1 for two empty sets) for CJK content, effectively disabling MMR
diversity detection.

Add support for:
- CJK Unified Ideographs (U+4E00–U+9FFF, U+3400–U+4DBF)
- Hiragana (U+3040–U+309F) and Katakana (U+30A0–U+30FF)
- Hangul Syllables (U+AC00–U+D7AF) and Jamo (U+1100–U+11FF)

Characters are extracted as unigrams, and bigrams are generated only
from characters that are adjacent in the original text (no spurious
bigrams across ASCII boundaries).

Fixes #28000
This commit is contained in:
buyitsydney 2026-02-28 11:43:05 +08:00 committed by Ayaan Zaidi
parent 92b8839488
commit 4b69c6d3f1
2 changed files with 92 additions and 4 deletions

View File

@@ -30,6 +30,37 @@ describe("tokenize", () => {
input: "hello hello world world",
expected: ["hello", "world"],
},
{
name: "CJK characters produce unigrams and bigrams",
input: "今天讨论",
expected: ["今", "天", "讨", "论", "今天", "天讨", "讨论"],
},
{
name: "mixed ASCII and CJK",
input: "hello 你好世界 test",
expected: ["hello", "test", "你", "好", "世", "界", "你好", "好世", "世界"],
},
{
name: "single CJK character (no bigrams)",
input: "龙",
expected: ["龙"],
},
{
name: "non-adjacent CJK chars do not form bigrams",
input: "我a好",
expected: ["a", "我", "好"],
// No "我好" bigram — they are separated by "a"
},
{
name: "Japanese hiragana",
input: "こんにちは",
expected: ["こ", "ん", "に", "ち", "は", "こん", "んに", "にち", "ちは"],
},
{
name: "Korean hangul",
input: "안녕하세요",
expected: ["안", "녕", "하", "세", "요", "안녕", "녕하", "하세", "세요"],
},
] as const;
for (const testCase of cases) {
@@ -90,10 +121,33 @@ describe("textSimilarity", () => {
{ name: "same words reordered", left: "hello world", right: "world hello", expected: 1 },
{ name: "different text", left: "hello world", right: "foo bar", expected: 0 },
{ name: "case insensitive", left: "Hello World", right: "hello world", expected: 1 },
{
name: "CJK similar texts share tokens",
left: "今天我们讨论了项目进展",
right: "今天我们讨论了会议安排",
// Shared unigrams: 今,天,我,们,讨,论,了 (7) + shared bigrams: 今天,天我,我们,们讨,讨论,论了 (6) = 13 shared
// Total unique tokens > 13, so similarity > 0 and < 1
expected: -1, // placeholder — just check > 0
},
{
name: "CJK completely different texts",
left: "苹果香蕉",
right: "钢铁煤炭",
expected: 0,
},
] as const;
for (const testCase of cases) {
expect(textSimilarity(testCase.left, testCase.right), testCase.name).toBe(testCase.expected);
if (testCase.expected === -1) {
// Placeholder: just assert positive similarity
const sim = textSimilarity(testCase.left, testCase.right);
expect(sim, testCase.name).toBeGreaterThan(0);
expect(sim, testCase.name).toBeLessThan(1);
} else {
expect(textSimilarity(testCase.left, testCase.right), testCase.name).toBe(
testCase.expected,
);
}
}
});
});

View File

@@ -25,13 +25,47 @@ export const DEFAULT_MMR_CONFIG: MMRConfig = {
lambda: 0.7,
};
/**
 * Regex matching CJK-family characters that lack whitespace word boundaries,
 * ordered by code point:
 * - Hangul Jamo (U+1100–U+11FF)
 * - Hiragana (U+3040–U+309F) & Katakana (U+30A0–U+30FF)
 * - CJK Extension A (U+3400–U+4DBF)
 * - CJK Unified Ideographs (U+4E00–U+9FFF) — Chinese hanzi, Japanese kanji, Korean hanja
 * - Hangul Syllables (U+AC00–U+D7AF)
 */
const CJK_RE = /[\u1100-\u11ff\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af]/;
/**
 * Tokenize text for Jaccard similarity computation.
 *
 * Extracts lowercase alphanumeric/underscore runs as word tokens, plus
 * CJK-family characters as unigrams and consecutive-pair bigrams. Bigrams
 * are created only from characters that are adjacent in the original text,
 * so mixed content like "我喜欢hello你好" will NOT produce the spurious
 * bigram "欢你".
 *
 * NOTE: a stale pre-change early return (`return new Set(tokens)`) left over
 * from the old ASCII-only implementation has been removed — it made all the
 * CJK handling below unreachable.
 *
 * @param text - Arbitrary input text (ASCII, CJK, or mixed).
 * @returns Set of unique tokens: ASCII words, CJK unigrams, and CJK bigrams.
 */
export function tokenize(text: string): Set<string> {
  const lower = text.toLowerCase();
  // ASCII-style word tokens (unchanged behavior for Latin text).
  const ascii = lower.match(/[a-z0-9_]+/g) ?? [];
  // Iterate by code point (Array.from splits on code points, not UTF-16
  // units) and record each CJK character with its position so adjacency can
  // still be checked after non-CJK characters are filtered out.
  const chars = Array.from(lower);
  const cjkData: { char: string; index: number }[] = [];
  for (let i = 0; i < chars.length; i++) {
    if (CJK_RE.test(chars[i])) {
      cjkData.push({ char: chars[i], index: i });
    }
  }
  // Build bigrams only from originally adjacent CJK characters.
  const bigrams: string[] = [];
  for (let i = 0; i < cjkData.length - 1; i++) {
    if (cjkData[i + 1].index === cjkData[i].index + 1) {
      bigrams.push(cjkData[i].char + cjkData[i + 1].char);
    }
  }
  const unigrams = cjkData.map((d) => d.char);
  return new Set([...ascii, ...bigrams, ...unigrams]);
}
/**