mirror of https://github.com/openclaw/openclaw.git
fix(memory): add CJK/Kana/Hangul support to MMR tokenize() for diversity detection
The tokenize() function only matched [a-z0-9_]+ patterns, so it returned an empty set for CJK-only text. As a result, Jaccard similarity was always 0 for CJK content (or always 1 when both token sets were empty), which effectively disabled MMR diversity detection.

Add support for:
- CJK Unified Ideographs (U+4E00–U+9FFF) and CJK Extension A (U+3400–U+4DBF)
- Hiragana (U+3040–U+309F) and Katakana (U+30A0–U+30FF)
- Hangul Syllables (U+AC00–U+D7AF) and Hangul Jamo (U+1100–U+11FF)

Each matching character is extracted as a unigram, and bigrams are generated only from characters that are adjacent in the original text (no spurious bigrams across ASCII boundaries).

Fixes #28000
This commit is contained in:
parent
92b8839488
commit
4b69c6d3f1
|
|
@ -30,6 +30,37 @@ describe("tokenize", () => {
|
|||
input: "hello hello world world",
|
||||
expected: ["hello", "world"],
|
||||
},
|
||||
{
|
||||
name: "CJK characters produce unigrams and bigrams",
|
||||
input: "今天讨论",
|
||||
expected: ["今", "天", "讨", "论", "今天", "天讨", "讨论"],
|
||||
},
|
||||
{
|
||||
name: "mixed ASCII and CJK",
|
||||
input: "hello 你好世界 test",
|
||||
expected: ["hello", "test", "你", "好", "世", "界", "你好", "好世", "世界"],
|
||||
},
|
||||
{
|
||||
name: "single CJK character (no bigrams)",
|
||||
input: "龙",
|
||||
expected: ["龙"],
|
||||
},
|
||||
{
|
||||
name: "non-adjacent CJK chars do not form bigrams",
|
||||
input: "我a好",
|
||||
expected: ["a", "我", "好"],
|
||||
// No "我好" bigram — they are separated by "a"
|
||||
},
|
||||
{
|
||||
name: "Japanese hiragana",
|
||||
input: "こんにちは",
|
||||
expected: ["こ", "ん", "に", "ち", "は", "こん", "んに", "にち", "ちは"],
|
||||
},
|
||||
{
|
||||
name: "Korean hangul",
|
||||
input: "안녕하세요",
|
||||
expected: ["안", "녕", "하", "세", "요", "안녕", "녕하", "하세", "세요"],
|
||||
},
|
||||
] as const;
|
||||
|
||||
for (const testCase of cases) {
|
||||
|
|
@ -90,10 +121,33 @@ describe("textSimilarity", () => {
|
|||
{ name: "same words reordered", left: "hello world", right: "world hello", expected: 1 },
|
||||
{ name: "different text", left: "hello world", right: "foo bar", expected: 0 },
|
||||
{ name: "case insensitive", left: "Hello World", right: "hello world", expected: 1 },
|
||||
{
|
||||
name: "CJK similar texts share tokens",
|
||||
left: "今天我们讨论了项目进展",
|
||||
right: "今天我们讨论了会议安排",
|
||||
// Shared unigrams: 今,天,我,们,讨,论,了 (7) + shared bigrams: 今天,天我,我们,们讨,讨论,论了 (6) = 13 shared
|
||||
// Total unique tokens > 13, so similarity > 0 and < 1
|
||||
expected: -1, // placeholder — just check > 0
|
||||
},
|
||||
{
|
||||
name: "CJK completely different texts",
|
||||
left: "苹果香蕉",
|
||||
right: "钢铁煤炭",
|
||||
expected: 0,
|
||||
},
|
||||
] as const;
|
||||
|
||||
for (const testCase of cases) {
|
||||
expect(textSimilarity(testCase.left, testCase.right), testCase.name).toBe(testCase.expected);
|
||||
if (testCase.expected === -1) {
|
||||
// Placeholder: just assert positive similarity
|
||||
const sim = textSimilarity(testCase.left, testCase.right);
|
||||
expect(sim, testCase.name).toBeGreaterThan(0);
|
||||
expect(sim, testCase.name).toBeLessThan(1);
|
||||
} else {
|
||||
expect(textSimilarity(testCase.left, testCase.right), testCase.name).toBe(
|
||||
testCase.expected,
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -25,13 +25,47 @@ export const DEFAULT_MMR_CONFIG: MMRConfig = {
|
|||
lambda: 0.7,
|
||||
};
|
||||
|
||||
/**
 * Matches a single character from a CJK-family script, i.e. a script that is
 * written without whitespace between words:
 * - CJK Unified Ideographs (U+4E00–U+9FFF) and CJK Extension A (U+3400–U+4DBF)
 * - Hiragana (U+3040–U+309F) and Katakana (U+30A0–U+30FF)
 * - Hangul Syllables (U+AC00–U+D7AF) and Hangul Jamo (U+1100–U+11FF)
 *
 * The pattern has no `g`/`y` flag, so `CJK_RE.test()` keeps no `lastIndex`
 * state between calls.
 */
const CJK_RE = /[\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\u1100-\u11ff]/;

/**
 * Tokenize text for Jaccard similarity computation.
 *
 * The input is lowercased, then three kinds of tokens are collected:
 * - runs of `[a-z0-9_]` (ASCII word tokens),
 * - every CJK-family character on its own (unigrams),
 * - every pair of CJK-family characters that are directly adjacent in the
 *   text (bigrams).
 *
 * Bigrams are only created from characters that are adjacent in the
 * original text, so mixed content like "我喜欢hello你好" will NOT
 * produce the spurious bigram "欢你".
 *
 * @param text - Arbitrary input text; may be empty.
 * @returns The set of distinct tokens, inserted in the order ASCII tokens,
 *   bigrams, unigrams. Empty when the text contains no matching characters.
 */
export function tokenize(text: string): Set<string> {
  const lower = text.toLowerCase();
  const ascii: readonly string[] = lower.match(/[a-z0-9_]+/g) ?? [];

  const unigrams: string[] = [];
  const bigrams: string[] = [];

  // The CJK character immediately preceding the current one, or undefined
  // when the previous character was not CJK (or at the start of the text).
  let previous: string | undefined;

  // Iterating a string yields whole code points, so a surrogate pair never
  // counts as two separate neighbours.
  for (const char of lower) {
    if (!CJK_RE.test(char)) {
      // Any non-CJK character breaks adjacency.
      previous = undefined;
      continue;
    }
    unigrams.push(char);
    if (previous !== undefined) {
      bigrams.push(previous + char);
    }
    previous = char;
  }

  return new Set([...ascii, ...bigrams, ...unigrams]);
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue