mirror of https://github.com/openclaw/openclaw.git
525 lines
9.6 KiB
TypeScript
525 lines
9.6 KiB
TypeScript
/**
 * Query expansion for FTS-only search mode.
 *
 * When no embedding provider is available, we fall back to FTS (full-text search).
 * FTS works best with specific keywords, but users often ask conversational queries
 * like "that thing we discussed yesterday" or "之前讨论的那个方案".
 *
 * This module extracts meaningful keywords from such queries to improve FTS results.
 */
|
|
|
|
// English stop words: common words that add no search value.
// Entries are lower-case because tokens are lower-cased before they are looked up.
const STOP_WORDS_EN = new Set([
  // Articles and determiners
  "a", "an", "the", "this", "that", "these", "those",
  // Pronouns
  "i", "me", "my", "we", "our", "you", "your", "he", "she", "it", "they", "them",
  // Common verbs and auxiliaries
  "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
  "do", "does", "did", "will", "would", "could", "should", "can", "may", "might",
  // Prepositions
  "in", "on", "at", "to", "for", "of", "with", "by", "from", "about",
  "into", "through", "during", "before", "after", "above", "below", "between", "under", "over",
  // Conjunctions and interrogatives
  "and", "or", "but", "if", "then", "because", "as", "while",
  "when", "where", "what", "which", "who", "how", "why",
  // Time references (vague, not useful for FTS).
  // "before" also belongs here but is already listed under prepositions.
  "yesterday", "today", "tomorrow", "earlier", "later", "recently", "ago", "just", "now",
  // Vague references
  "thing", "things", "stuff", "something", "anything", "everything", "nothing",
  // Request words
  "please", "help", "find", "show", "get", "tell", "give",
]);
|
|
|
|
// Korean stop words: particles, pronouns and other function words that add no search value.
const STOP_WORDS_KO = new Set([
  // Particles (조사)
  "은", "는", "이", "가", "을", "를", "의", "에", "에서", "로", "으로", "와", "과",
  "도", "만", "까지", "부터", "한테", "에게", "께", "처럼", "같이", "보다", "마다", "밖에", "대로",
  // Pronouns (대명사)
  "나", "나는", "내가", "나를", "너", "우리", "저", "저희",
  "그", "그녀", "그들", "이것", "저것", "그것", "여기", "저기", "거기",
  // Common verbs / auxiliaries (일반 동사/보조 동사).
  // "보다" appears again here as a verb; the Set stores it once.
  "있다", "없다", "하다", "되다", "이다", "아니다", "보다", "주다", "오다", "가다",
  // Dependent / vague nouns (의존 명사)
  "것", "거", "등", "수", "때", "곳", "중", "분",
  // Adverbs
  "잘", "더", "또", "매우", "정말", "아주", "많이", "너무", "좀",
  // Conjunctions
  "그리고", "하지만", "그래서", "그런데", "그러나", "또는", "그러면",
  // Question words
  "왜", "어떻게", "뭐", "언제", "어디", "누구", "무엇", "어떤",
  // Time (vague)
  "어제", "오늘", "내일", "최근", "지금", "아까", "나중", "전에",
  // Request words
  "제발", "부탁",
]);
|
|
|
|
// Korean particles that may trail a word and are stripped during tokenization.
// The list is sorted longest-first at load time, so scanning it in order always
// tries a two-syllable particle before any one-syllable particle.
const KO_TRAILING_PARTICLES = [
  // Two-syllable particles
  "에서", "으로", "에게", "한테", "처럼", "같이",
  "보다", "까지", "부터", "마다", "밖에", "대로",
  // One-syllable particles
  "은", "는", "이", "가", "을", "를", "의", "에", "로", "와", "과", "도", "만",
].toSorted((left, right) => right.length - left.length);
|
|
|
|
/**
 * Remove one trailing Korean particle from a token.
 *
 * KO_TRAILING_PARTICLES is ordered longest-first, so the first suffix that
 * matches is also the longest one. A particle only counts as a match when the
 * token is strictly longer than it, so the returned stem is never empty.
 *
 * @param token - Word to inspect.
 * @returns The token without its trailing particle, or `null` when no particle matches.
 */
function stripKoreanTrailingParticle(token: string): string | null {
  const suffix = KO_TRAILING_PARTICLES.find(
    (candidate) => token.length > candidate.length && token.endsWith(candidate),
  );
  return suffix === undefined ? null : token.slice(0, -suffix.length);
}
|
|
|
|
/**
 * Decide whether a particle-stripped stem is worth emitting as its own keyword.
 *
 * - A stem containing a Hangul syllable must be at least two characters long;
 *   this rejects bogus one-syllable stems such as "논의" -> "논".
 * - Any other stem is accepted only if it consists solely of ASCII letters,
 *   digits or underscores, which covers mixed tokens such as "API를" -> "api".
 *
 * @param stem - Stem left after removing a trailing particle.
 * @returns `true` when the stem should be emitted as a keyword.
 */
function isUsefulKoreanStem(stem: string): boolean {
  const hasHangulSyllable = /[\uac00-\ud7af]/.test(stem);
  return hasHangulSyllable ? stem.length >= 2 : /^[a-z0-9_]+$/i.test(stem);
}
|
|
|
|
// Chinese stop words: function words and vague terms that add no search value.
const STOP_WORDS_ZH = new Set([
  // Pronouns and demonstratives
  "我", "我们", "你", "你们", "他", "她", "它", "他们",
  "这", "那", "这个", "那个", "这些", "那些",
  // Auxiliary words and sentence-final particles
  "的", "了", "着", "过", "得", "地", "吗", "呢", "吧", "啊", "呀", "嘛", "啦",
  // Verbs (common, vague)
  "是", "有", "在", "被", "把", "给", "让", "用", "到", "去",
  "来", "做", "说", "看", "找", "想", "要", "能", "会", "可以",
  // Prepositions, conjunctions and adverbs
  "和", "与", "或", "但", "但是", "因为", "所以", "如果", "虽然",
  "而", "也", "都", "就", "还", "又", "再", "才", "只",
  // Time (vague)
  "之前", "以前", "之后", "以后", "刚才", "现在", "昨天", "今天", "明天", "最近",
  // Vague references and interrogatives
  "东西", "事情", "事", "什么", "哪个", "哪些", "怎么", "为什么", "多少",
  // Request words
  "请", "帮", "帮忙", "告诉",
]);
|
|
|
|
/**
 * Check whether a token looks like a meaningful keyword.
 *
 * A token is rejected when it is:
 * - empty,
 * - made only of ASCII letters and shorter than three characters
 *   (likely a stop word or fragment),
 * - made only of digits (not useful for semantic search), or
 * - made only of punctuation and/or symbol characters.
 *
 * @param token - Candidate token.
 * @returns `true` when none of the rejection rules apply.
 */
function isValidKeyword(token: string): boolean {
  if (!token) {
    return false;
  }
  const isShortAsciiWord = token.length < 3 && /^[a-zA-Z]+$/.test(token);
  const isDigitsOnly = /^\d+$/.test(token);
  const isPunctuationOnly = /^[\p{P}\p{S}]+$/u.test(token);
  return !(isShortAsciiWord || isDigitsOnly || isPunctuationOnly);
}
|
|
|
|
/**
 * Split free-form text into candidate search tokens (English, Chinese, Korean).
 *
 * The text is lower-cased and trimmed, then split on whitespace and Unicode
 * punctuation. Each resulting segment is handled according to its script:
 *
 * - Han characters (U+4E00–U+9FFF): there is no word segmenter, so every
 *   character is emitted as a unigram, followed by every adjacent pair as a
 *   bigram for better phrase matching.
 * - Hangul (syllables or compatibility jamo): the word is emitted unless it,
 *   or its particle-stripped stem, is a Korean stop word; the stem is emitted
 *   as well when it is a useful keyword.
 * - Anything else: the segment is emitted unchanged.
 *
 * A segment that mixes Han with other scripts without separators (for example
 * "修复api的bug") is first cut into Han and non-Han runs. This keeps the
 * non-Han parts ("api", "bug") instead of discarding them, and prevents
 * bigrams from being formed across the gap between two Han runs.
 *
 * The result may contain duplicates and stop words; only Korean stop words are
 * filtered at this stage.
 *
 * @param text - Raw query text.
 * @returns Tokens in the order they were produced.
 */
function tokenize(text: string): string[] {
  const hanChar = /[\u4e00-\u9fff]/;
  const hanOrOtherRuns = /[\u4e00-\u9fff]+|[^\u4e00-\u9fff]+/g;
  const hangulChar = /[\uac00-\ud7af\u3131-\u3163]/;

  const tokens: string[] = [];
  const normalized = text.toLowerCase().trim();

  // Split into segments (English words, Chinese character sequences, etc.)
  const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);

  for (const segment of segments) {
    // Only segments that contain Han need splitting; every other segment is a
    // single piece, exactly as before.
    const pieces = hanChar.test(segment) ? (segment.match(hanOrOtherRuns) ?? []) : [segment];

    for (const piece of pieces) {
      if (hanChar.test(piece)) {
        // A Han run holds only BMP characters, so code-unit slicing is safe.
        // Unigrams first, then bigrams.
        tokens.push(...Array.from(piece));
        for (let i = 0; i + 1 < piece.length; i++) {
          tokens.push(piece.slice(i, i + 2));
        }
      } else if (hangulChar.test(piece)) {
        // Keep the word as-is unless it is effectively a stop word once its
        // trailing particle is removed.
        const stem = stripKoreanTrailingParticle(piece);
        const stemIsStopWord = stem !== null && STOP_WORDS_KO.has(stem);
        if (!STOP_WORDS_KO.has(piece) && !stemIsStopWord) {
          tokens.push(piece);
        }
        // Also emit the particle-stripped stem when it is a useful keyword.
        if (stem && !STOP_WORDS_KO.has(stem) && isUsefulKoreanStem(stem)) {
          tokens.push(stem);
        }
      } else {
        // Non-CJK text is kept as a single token.
        tokens.push(piece);
      }
    }
  }

  return tokens;
}
|
|
|
|
/**
 * Extract keywords from a conversational query for FTS search.
 *
 * The query is tokenized (which lower-cases it); tokens that are English,
 * Chinese or Korean stop words, or that fail `isValidKeyword`, are discarded;
 * the remaining tokens are de-duplicated while keeping first-seen order.
 *
 * Examples:
 * - "that thing we discussed about the API" → ["discussed", "api"]
 * - "what was the solution for the bug" → ["solution", "bug"]
 * - "之前讨论的那个方案" → includes "讨论" and "方案", alongside the other
 *   single-character and two-character n-grams that are not stop words
 *
 * @param query - User's query text.
 * @returns Unique keywords in order of first appearance.
 */
export function extractKeywords(query: string): string[] {
  const isStopWord = (token: string): boolean =>
    STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token);

  const candidates = tokenize(query).filter(
    (token) => !isStopWord(token) && isValidKeyword(token),
  );

  // A Set iterates in first-insertion order, so repeats are dropped without reordering.
  return [...new Set(candidates)];
}
|
|
|
|
/**
 * Expand a query for FTS search.
 *
 * @param query - User's original query.
 * @returns An object with:
 *   - `original`: the query with surrounding whitespace trimmed,
 *   - `keywords`: the keywords extracted from `original`,
 *   - `expanded`: `original` followed by each keyword, joined with " OR ", so
 *     that both exact matches and keyword matches are found. When no keywords
 *     are extracted this is just `original`.
 */
export function expandQueryForFts(query: string): {
  original: string;
  keywords: string[];
  expanded: string;
} {
  const original = query.trim();
  const keywords = extractKeywords(original);

  // Joining a one-element list yields that element, so an empty keyword list
  // leaves the original query untouched.
  const expanded = [original, ...keywords].join(" OR ");

  return { original, keywords, expanded };
}
|
|
|
|
/**
 * Signature of an optional LLM-backed query expander.
 *
 * Given the user's query, it asynchronously produces a list of keywords. It can
 * be supplied to enhance keyword extraction with semantic understanding.
 */
export type LlmQueryExpander = (query: string) => Promise<string[]>;
|
|
|
|
/**
 * Expand a query into keywords, preferring an LLM expander when one is given.
 *
 * The local `extractKeywords` result is used when no expander is supplied, when
 * the expander throws or rejects, or when it resolves to an empty list.
 *
 * @param query - User's query text.
 * @param llmExpander - Optional LLM-backed expander to try first.
 * @returns The expander's keywords when it yields at least one, otherwise the
 *   locally extracted keywords.
 */
export async function expandQueryWithLlm(
  query: string,
  llmExpander?: LlmQueryExpander,
): Promise<string[]> {
  let llmKeywords: string[] | undefined;

  if (llmExpander) {
    try {
      const suggested = await llmExpander(query);
      // The length check stays inside the try so that a malformed result also
      // ends in the local fallback.
      if (suggested.length > 0) {
        llmKeywords = suggested;
      }
    } catch {
      // Best effort: an expander failure is deliberately ignored in favour of
      // local extraction.
    }
  }

  return llmKeywords ?? extractKeywords(query);
}
|