fix(memory): add CJK/Kana/Hangul support to MMR tokenize() for diversity detection

The tokenize() function only matched [a-z0-9_]+ patterns, returning an
empty set for CJK-only text. This made Jaccard similarity always 0 (or
always 1 for two empty sets) for CJK content, effectively disabling MMR
diversity detection.

Add support for:
- CJK Unified Ideographs (U+4E00–U+9FFF, U+3400–U+4DBF)
- Hiragana (U+3040–U+309F) and Katakana (U+30A0–U+30FF)
- Hangul Syllables (U+AC00–U+D7AF) and Jamo (U+1100–U+11FF)

Characters are extracted as unigrams, and bigrams are generated only
from characters that are adjacent in the original text (no spurious
bigrams across ASCII boundaries).

Fixes #28000
This commit is contained in:
buyitsydney 2026-02-28 11:43:05 +08:00 committed by Ayaan Zaidi
parent 92b8839488
commit 4b69c6d3f1
2 changed files with 92 additions and 4 deletions

View File

@@ -30,6 +30,37 @@ describe("tokenize", () => {
input: "hello hello world world",
expected: ["hello", "world"],
},
{
name: "CJK characters produce unigrams and bigrams",
input: "今天讨论",
expected: ["今", "天", "讨", "论", "今天", "天讨", "讨论"],
},
{
name: "mixed ASCII and CJK",
input: "hello 你好世界 test",
expected: ["hello", "test", "你", "好", "世", "界", "你好", "好世", "世界"],
},
{
name: "single CJK character (no bigrams)",
input: "龙",
expected: ["龙"],
},
{
name: "non-adjacent CJK chars do not form bigrams",
input: "我a好",
expected: ["a", "我", "好"],
// No "我好" bigram — they are separated by "a"
},
{
name: "Japanese hiragana",
input: "こんにちは",
expected: ["こ", "ん", "に", "ち", "は", "こん", "んに", "にち", "ちは"],
},
{
name: "Korean hangul",
input: "안녕하세요",
expected: ["안", "녕", "하", "세", "요", "안녕", "녕하", "하세", "세요"],
},
] as const;
for (const testCase of cases) {
@@ -90,10 +121,33 @@ describe("textSimilarity", () => {
{ name: "same words reordered", left: "hello world", right: "world hello", expected: 1 },
{ name: "different text", left: "hello world", right: "foo bar", expected: 0 },
{ name: "case insensitive", left: "Hello World", right: "hello world", expected: 1 },
{
name: "CJK similar texts share tokens",
left: "今天我们讨论了项目进展",
right: "今天我们讨论了会议安排",
// Shared unigrams: 今,天,我,们,讨,论,了 (7) + shared bigrams: 今天,天我,我们,们讨,讨论,论了 (6) = 13 shared
// Total unique tokens > 13, so similarity > 0 and < 1
expected: -1, // placeholder — just check > 0
},
{
name: "CJK completely different texts",
left: "苹果香蕉",
right: "钢铁煤炭",
expected: 0,
},
] as const;
for (const testCase of cases) {
expect(textSimilarity(testCase.left, testCase.right), testCase.name).toBe(testCase.expected);
if (testCase.expected === -1) {
// Placeholder: just assert positive similarity
const sim = textSimilarity(testCase.left, testCase.right);
expect(sim, testCase.name).toBeGreaterThan(0);
expect(sim, testCase.name).toBeLessThan(1);
} else {
expect(textSimilarity(testCase.left, testCase.right), testCase.name).toBe(
testCase.expected,
);
}
}
});
});

View File

@@ -25,13 +25,47 @@ export const DEFAULT_MMR_CONFIG: MMRConfig = {
lambda: 0.7,
};
/**
 * Regex matching CJK-family characters that lack whitespace word boundaries,
 * ordered by code point:
 * - Hangul Jamo (U+1100–U+11FF)
 * - Hiragana (U+3040–U+309F) & Katakana (U+30A0–U+30FF)
 * - CJK Extension A (U+3400–U+4DBF)
 * - CJK Unified Ideographs (U+4E00–U+9FFF) — Chinese hanzi, Japanese kanji, Korean hanja
 * - Hangul Syllables (U+AC00–U+D7AF)
 */
const CJK_RE = /[\u1100-\u11ff\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af]/;
/**
 * Tokenize text for Jaccard similarity computation.
 *
 * Extracts lowercase alphanumeric/underscore runs as word tokens, plus
 * CJK-family characters as unigrams and consecutive-pair bigrams. Bigrams
 * are created only from characters that are adjacent in the original text,
 * so mixed content like "我喜欢hello你好" will NOT produce the spurious
 * bigram "欢你".
 *
 * NOTE: a stale pre-change early return (`return new Set(tokens)`) left over
 * from the old ASCII-only implementation has been removed — it made all the
 * CJK handling below unreachable.
 *
 * @param text - Arbitrary input text (ASCII, CJK, or mixed).
 * @returns Set of unique tokens: ASCII words, CJK unigrams, and CJK bigrams.
 */
export function tokenize(text: string): Set<string> {
  const lower = text.toLowerCase();
  // ASCII-style word tokens (unchanged behavior for Latin text).
  const ascii = lower.match(/[a-z0-9_]+/g) ?? [];
  // Iterate by code point (Array.from splits on code points, not UTF-16
  // units) and record each CJK character with its position so adjacency can
  // still be checked after non-CJK characters are filtered out.
  const chars = Array.from(lower);
  const cjkData: { char: string; index: number }[] = [];
  for (let i = 0; i < chars.length; i++) {
    if (CJK_RE.test(chars[i])) {
      cjkData.push({ char: chars[i], index: i });
    }
  }
  // Build bigrams only from originally adjacent CJK characters.
  const bigrams: string[] = [];
  for (let i = 0; i < cjkData.length - 1; i++) {
    if (cjkData[i + 1].index === cjkData[i].index + 1) {
      bigrams.push(cjkData[i].char + cjkData[i + 1].char);
    }
  }
  const unigrams = cjkData.map((d) => d.char);
  return new Set([...ascii, ...bigrams, ...unigrams]);
}
/**