mirror of https://github.com/openclaw/openclaw.git
Memory: add configurable FTS5 tokenizer for CJK text support (openclaw#56707)
Verified: - pnpm build - pnpm check - pnpm test -- extensions/memory-core/src/memory/manager-search.test.ts packages/memory-host-sdk/src/host/query-expansion.test.ts - pnpm test -- extensions/memory-core/src/memory/index.test.ts -t "reindexes when extraPaths change" - pnpm test -- src/config/schema.base.generated.test.ts - pnpm test -- src/media-understanding/image.test.ts - pnpm test Co-authored-by: Mitsuyuki Osabe <24588751+carrotRakko@users.noreply.github.com>
This commit is contained in:
parent
6f7ff545dd
commit
3ce48aff66
|
|
@ -20,6 +20,7 @@ Docs: https://docs.openclaw.ai
|
|||
- Memory/QMD: honor `memory.qmd.update.embedInterval` even when regular QMD update cadence is disabled or slower by arming a dedicated embed-cadence maintenance timer, while avoiding redundant timers when regular updates are already frequent enough. (#37326) Thanks @barronlroth.
|
||||
- Agents/memory flush: keep daily memory flush files append-only during embedded attempts so compaction writes do not overwrite earlier notes. (#53725) Thanks @HPluseven.
|
||||
- Web UI/markdown: stop bare auto-links from swallowing adjacent CJK text while preserving valid mixed-script path and query characters in rendered links. (#48410) Thanks @jnuyao.
|
||||
- Memory/FTS: add configurable trigram tokenization plus short-CJK substring fallback so memory search can find Chinese, Japanese, and Korean text without breaking mixed long-and-short queries. Thanks @carrotRakko.
|
||||
|
||||
## 2026.3.28
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,88 @@
|
|||
import {
|
||||
ensureMemoryIndexSchema,
|
||||
requireNodeSqlite,
|
||||
} from "openclaw/plugin-sdk/memory-core-host-engine-storage";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { bm25RankToScore, buildFtsQuery } from "./hybrid.js";
|
||||
import { searchKeyword } from "./manager-search.js";
|
||||
|
||||
// Integration-style tests for the short-CJK substring fallback used when the
// FTS5 index is built with the trigram tokenizer: trigram MATCH cannot find
// query terms shorter than 3 characters (they return no results), so
// searchKeyword falls back to LIKE-substring filtering for those terms.
describe("searchKeyword trigram fallback", () => {
  const { DatabaseSync } = requireNodeSqlite();

  // Creates an in-memory SQLite database whose chunks_fts table is built with
  // the trigram tokenizer (embedding cache disabled — keyword search only).
  function createTrigramDb() {
    const db = new DatabaseSync(":memory:");
    ensureMemoryIndexSchema({
      db,
      embeddingCacheTable: "embedding_cache",
      cacheEnabled: false,
      ftsTable: "chunks_fts",
      ftsEnabled: true,
      ftsTokenizer: "trigram",
    });
    return db;
  }

  // Seeds chunks_fts with the given rows, runs searchKeyword against them with
  // trigram tokenization enabled, and always closes the db afterwards.
  async function runSearch(params: {
    rows: Array<{ id: string; path: string; text: string }>;
    query: string;
  }) {
    const db = createTrigramDb();
    try {
      const insert = db.prepare(
        "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?, ?, ?, ?, ?, ?, ?)",
      );
      for (const row of params.rows) {
        // "mock-embed" matches providerModel below so the model filter passes.
        insert.run(row.text, row.id, row.path, "memory", "mock-embed", 1, 1);
      }
      return await searchKeyword({
        db,
        ftsTable: "chunks_fts",
        providerModel: "mock-embed",
        query: params.query,
        ftsTokenizer: "trigram",
        limit: 10,
        snippetMaxChars: 200,
        sourceFilter: { sql: "", params: [] },
        buildFtsQuery,
        bm25RankToScore,
      });
    } finally {
      db.close();
    }
  }

  it("finds short Chinese queries with substring fallback", async () => {
    const results = await runSearch({
      rows: [{ id: "1", path: "memory/zh.md", text: "今天玩成语接龙游戏" }],
      query: "成语",
    });
    expect(results.map((row) => row.id)).toContain("1");
    // Substring-only hits carry no bm25 rank, so textScore is pinned to 1.
    expect(results[0]?.textScore).toBe(1);
  });

  it("finds short Japanese and Korean queries with substring fallback", async () => {
    const japaneseResults = await runSearch({
      rows: [{ id: "jp", path: "memory/jp.md", text: "今日はしりとり大会" }],
      query: "しり とり",
    });
    expect(japaneseResults.map((row) => row.id)).toEqual(["jp"]);

    const koreanResults = await runSearch({
      rows: [{ id: "ko", path: "memory/ko.md", text: "오늘 끝말잇기 게임을 했다" }],
      query: "끝말",
    });
    expect(koreanResults.map((row) => row.id)).toEqual(["ko"]);
  });

  it("keeps MATCH semantics for long trigram terms while requiring short CJK substrings", async () => {
    // "成语接龙" (>= 3 chars) goes through MATCH; "游戏" (2 chars) must be a
    // substring — so only the row containing both should be returned.
    const results = await runSearch({
      rows: [
        { id: "match", path: "memory/good.md", text: "今天玩成语接龙游戏" },
        { id: "partial", path: "memory/partial.md", text: "今天玩成语接龙" },
      ],
      query: "成语接龙 游戏",
    });
    expect(results.map((row) => row.id)).toEqual(["match"]);
    // MATCH contributed, so the score comes from bm25 rather than the fallback 1.
    expect(results[0]?.textScore).toBeGreaterThan(0);
  });
});
|
||||
|
|
@ -7,6 +7,8 @@ import {
|
|||
|
||||
const vectorToBlob = (embedding: number[]): Buffer =>
|
||||
Buffer.from(new Float32Array(embedding).buffer);
|
||||
// Runs of letters, digits, and underscores — the terms extracted from a raw
// query before they are fed into an FTS5 MATCH expression.
const FTS_QUERY_TOKEN_RE = /[\p{L}\p{N}_]+/gu;
// Detects Hiragana/Katakana, CJK ideographs, and Hangul (syllables + jamo).
// Terms containing these scripts that are shorter than 3 characters cannot be
// found by the FTS5 trigram tokenizer and need a LIKE-substring fallback.
const SHORT_CJK_TRIGRAM_RE = /[\u3040-\u30ff\u3400-\u9fff\uac00-\ud7af\u3131-\u3163]/u;
|
||||
|
||||
export type SearchSource = string;
|
||||
|
||||
|
|
@ -20,6 +22,55 @@ export type SearchRowResult = {
|
|||
source: SearchSource;
|
||||
};
|
||||
|
||||
function escapeLikePattern(term: string): string {
|
||||
return term.replaceAll("\\", "\\\\").replaceAll("%", "\\%").replaceAll("_", "\\_");
|
||||
}
|
||||
|
||||
function buildMatchQueryFromTerms(terms: string[]): string | null {
|
||||
if (terms.length === 0) {
|
||||
return null;
|
||||
}
|
||||
const quoted = terms.map((term) => `"${term.replaceAll('"', "")}"`);
|
||||
return quoted.join(" AND ");
|
||||
}
|
||||
|
||||
function planKeywordSearch(params: {
|
||||
query: string;
|
||||
ftsTokenizer?: "unicode61" | "trigram";
|
||||
buildFtsQuery: (raw: string) => string | null;
|
||||
}): { matchQuery: string | null; substringTerms: string[] } {
|
||||
if (params.ftsTokenizer !== "trigram") {
|
||||
return {
|
||||
matchQuery: params.buildFtsQuery(params.query),
|
||||
substringTerms: [],
|
||||
};
|
||||
}
|
||||
|
||||
const tokens =
|
||||
params.query
|
||||
.match(FTS_QUERY_TOKEN_RE)
|
||||
?.map((token) => token.trim())
|
||||
.filter(Boolean) ?? [];
|
||||
if (tokens.length === 0) {
|
||||
return { matchQuery: null, substringTerms: [] };
|
||||
}
|
||||
|
||||
const matchTerms: string[] = [];
|
||||
const substringTerms: string[] = [];
|
||||
for (const token of tokens) {
|
||||
if (SHORT_CJK_TRIGRAM_RE.test(token) && Array.from(token).length < 3) {
|
||||
substringTerms.push(token);
|
||||
continue;
|
||||
}
|
||||
matchTerms.push(token);
|
||||
}
|
||||
|
||||
return {
|
||||
matchQuery: buildMatchQueryFromTerms(matchTerms),
|
||||
substringTerms,
|
||||
};
|
||||
}
|
||||
|
||||
export async function searchVector(params: {
|
||||
db: DatabaseSync;
|
||||
vectorTable: string;
|
||||
|
|
@ -141,6 +192,7 @@ export async function searchKeyword(params: {
|
|||
ftsTable: string;
|
||||
providerModel: string | undefined;
|
||||
query: string;
|
||||
ftsTokenizer?: "unicode61" | "trigram";
|
||||
limit: number;
|
||||
snippetMaxChars: number;
|
||||
sourceFilter: { sql: string; params: SearchSource[] };
|
||||
|
|
@ -150,25 +202,42 @@ export async function searchKeyword(params: {
|
|||
if (params.limit <= 0) {
|
||||
return [];
|
||||
}
|
||||
const ftsQuery = params.buildFtsQuery(params.query);
|
||||
if (!ftsQuery) {
|
||||
const plan = planKeywordSearch({
|
||||
query: params.query,
|
||||
ftsTokenizer: params.ftsTokenizer,
|
||||
buildFtsQuery: params.buildFtsQuery,
|
||||
});
|
||||
if (!plan.matchQuery && plan.substringTerms.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
// When providerModel is undefined (FTS-only mode), search all models
|
||||
const modelClause = params.providerModel ? " AND model = ?" : "";
|
||||
const modelParams = params.providerModel ? [params.providerModel] : [];
|
||||
const substringClause = plan.substringTerms.map(() => " AND text LIKE ? ESCAPE '\\'").join("");
|
||||
const substringParams = plan.substringTerms.map((term) => `%${escapeLikePattern(term)}%`);
|
||||
const whereClause = plan.matchQuery
|
||||
? `${params.ftsTable} MATCH ?${substringClause}${modelClause}${params.sourceFilter.sql}`
|
||||
: `1=1${substringClause}${modelClause}${params.sourceFilter.sql}`;
|
||||
const queryParams = [
|
||||
...(plan.matchQuery ? [plan.matchQuery] : []),
|
||||
...substringParams,
|
||||
...modelParams,
|
||||
...params.sourceFilter.params,
|
||||
params.limit,
|
||||
];
|
||||
const rankExpression = plan.matchQuery ? `bm25(${params.ftsTable})` : "0";
|
||||
|
||||
const rows = params.db
|
||||
.prepare(
|
||||
`SELECT id, path, source, start_line, end_line, text,\n` +
|
||||
` bm25(${params.ftsTable}) AS rank\n` +
|
||||
` ${rankExpression} AS rank\n` +
|
||||
` FROM ${params.ftsTable}\n` +
|
||||
` WHERE ${params.ftsTable} MATCH ?${modelClause}${params.sourceFilter.sql}\n` +
|
||||
` WHERE ${whereClause}\n` +
|
||||
` ORDER BY rank ASC\n` +
|
||||
` LIMIT ?`,
|
||||
)
|
||||
.all(ftsQuery, ...modelParams, ...params.sourceFilter.params, params.limit) as Array<{
|
||||
.all(...queryParams) as Array<{
|
||||
id: string;
|
||||
path: string;
|
||||
source: SearchSource;
|
||||
|
|
@ -179,7 +248,7 @@ export async function searchKeyword(params: {
|
|||
}>;
|
||||
|
||||
return rows.map((row) => {
|
||||
const textScore = params.bm25RankToScore(row.rank);
|
||||
const textScore = plan.matchQuery ? params.bm25RankToScore(row.rank) : 1;
|
||||
return {
|
||||
id: row.id,
|
||||
path: row.path,
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@ type MemoryIndexMeta = {
|
|||
chunkTokens: number;
|
||||
chunkOverlap: number;
|
||||
vectorDims?: number;
|
||||
ftsTokenizer?: string;
|
||||
};
|
||||
|
||||
type MemorySyncProgressState = {
|
||||
|
|
@ -362,6 +363,7 @@ export abstract class MemoryManagerSyncOps {
|
|||
cacheEnabled: this.cache.enabled,
|
||||
ftsTable: FTS_TABLE,
|
||||
ftsEnabled: this.fts.enabled,
|
||||
ftsTokenizer: this.settings.store.fts.tokenizer,
|
||||
});
|
||||
this.fts.available = result.ftsAvailable;
|
||||
if (result.ftsError) {
|
||||
|
|
@ -1028,7 +1030,8 @@ export abstract class MemoryManagerSyncOps {
|
|||
meta.scopeHash !== configuredScopeHash ||
|
||||
meta.chunkTokens !== this.settings.chunking.tokens ||
|
||||
meta.chunkOverlap !== this.settings.chunking.overlap ||
|
||||
(vectorReady && !meta?.vectorDims);
|
||||
(vectorReady && !meta?.vectorDims) ||
|
||||
(meta.ftsTokenizer ?? "unicode61") !== this.settings.store.fts.tokenizer;
|
||||
try {
|
||||
if (needsFullReindex) {
|
||||
if (
|
||||
|
|
@ -1220,6 +1223,7 @@ export abstract class MemoryManagerSyncOps {
|
|||
scopeHash: this.resolveConfiguredScopeHash(),
|
||||
chunkTokens: this.settings.chunking.tokens,
|
||||
chunkOverlap: this.settings.chunking.overlap,
|
||||
ftsTokenizer: this.settings.store.fts.tokenizer,
|
||||
};
|
||||
if (!nextMeta) {
|
||||
throw new Error("Failed to compute memory index metadata for reindexing.");
|
||||
|
|
@ -1292,6 +1296,7 @@ export abstract class MemoryManagerSyncOps {
|
|||
scopeHash: this.resolveConfiguredScopeHash(),
|
||||
chunkTokens: this.settings.chunking.tokens,
|
||||
chunkOverlap: this.settings.chunking.overlap,
|
||||
ftsTokenizer: this.settings.store.fts.tokenizer,
|
||||
};
|
||||
if (this.vector.available && this.vector.dims) {
|
||||
nextMeta.vectorDims = this.vector.dims;
|
||||
|
|
@ -1306,9 +1311,10 @@ export abstract class MemoryManagerSyncOps {
|
|||
this.db.exec(`DELETE FROM chunks`);
|
||||
if (this.fts.enabled && this.fts.available) {
|
||||
try {
|
||||
this.db.exec(`DELETE FROM ${FTS_TABLE}`);
|
||||
this.db.exec(`DROP TABLE IF EXISTS ${FTS_TABLE}`);
|
||||
} catch {}
|
||||
}
|
||||
this.ensureSchema();
|
||||
this.dropVectorTable();
|
||||
this.vector.dims = undefined;
|
||||
this.sessionsDirtyFiles.clear();
|
||||
|
|
|
|||
|
|
@ -352,7 +352,9 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem
|
|||
|
||||
// Extract keywords for better FTS matching on conversational queries
|
||||
// e.g., "that thing we discussed about the API" → ["discussed", "API"]
|
||||
const keywords = extractKeywords(cleaned);
|
||||
const keywords = extractKeywords(cleaned, {
|
||||
ftsTokenizer: this.settings.store.fts.tokenizer,
|
||||
});
|
||||
const searchTerms = keywords.length > 0 ? keywords : [cleaned];
|
||||
|
||||
// Search with each keyword and merge results
|
||||
|
|
@ -488,6 +490,7 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem
|
|||
ftsTable: FTS_TABLE,
|
||||
providerModel,
|
||||
query,
|
||||
ftsTokenizer: this.settings.store.fts.tokenizer,
|
||||
limit,
|
||||
snippetMaxChars: SNIPPET_MAX_CHARS,
|
||||
sourceFilter,
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ export function ensureMemoryIndexSchema(params: {
|
|||
cacheEnabled: boolean;
|
||||
ftsTable: string;
|
||||
ftsEnabled: boolean;
|
||||
ftsTokenizer?: "unicode61" | "trigram";
|
||||
}): { ftsAvailable: boolean; ftsError?: string } {
|
||||
params.db.exec(`
|
||||
CREATE TABLE IF NOT EXISTS meta (
|
||||
|
|
@ -58,6 +59,8 @@ export function ensureMemoryIndexSchema(params: {
|
|||
let ftsError: string | undefined;
|
||||
if (params.ftsEnabled) {
|
||||
try {
|
||||
const tokenizer = params.ftsTokenizer ?? "unicode61";
|
||||
const tokenizeClause = tokenizer === "trigram" ? `, tokenize='trigram case_sensitive 0'` : "";
|
||||
params.db.exec(
|
||||
`CREATE VIRTUAL TABLE IF NOT EXISTS ${params.ftsTable} USING fts5(\n` +
|
||||
` text,\n` +
|
||||
|
|
@ -67,7 +70,7 @@ export function ensureMemoryIndexSchema(params: {
|
|||
` model UNINDEXED,\n` +
|
||||
` start_line UNINDEXED,\n` +
|
||||
` end_line UNINDEXED\n` +
|
||||
`);`,
|
||||
`${tokenizeClause});`,
|
||||
);
|
||||
ftsAvailable = true;
|
||||
} catch (err) {
|
||||
|
|
|
|||
|
|
@ -174,6 +174,51 @@ describe("extractKeywords", () => {
|
|||
const testCount = keywords.filter((k) => k === "test").length;
|
||||
expect(testCount).toBe(1);
|
||||
});
|
||||
|
||||
// Trigram-specific keyword-extraction behavior: contiguous CJK blocks are kept
// whole (FTS5 trigram needs >= 3 characters per term), while English handling
// and stop-word filtering stay unchanged.
describe("with trigram tokenizer", () => {
  const trigramOpts = { ftsTokenizer: "trigram" as const };

  it("emits whole CJK block instead of unigrams in trigram mode", () => {
    const defaultKeywords = extractKeywords("之前讨论的那个方案");
    const trigramKeywords = extractKeywords("之前讨论的那个方案", trigramOpts);
    // Default mode produces bigrams
    expect(defaultKeywords).toContain("讨论");
    expect(defaultKeywords).toContain("方案");
    // Trigram mode emits the whole contiguous CJK block (FTS5 trigram
    // requires >= 3 chars per term; individual characters return no results)
    expect(trigramKeywords).toContain("之前讨论的那个方案");
    expect(trigramKeywords).not.toContain("讨论");
    expect(trigramKeywords).not.toContain("方案");
  });

  it("skips Japanese kanji bigrams in trigram mode", () => {
    const defaultKeywords = extractKeywords("経済政策について");
    const trigramKeywords = extractKeywords("経済政策について", trigramOpts);
    // Default mode adds kanji bigrams: 経済, 済政, 政策
    expect(defaultKeywords).toContain("経済");
    expect(defaultKeywords).toContain("済政");
    expect(defaultKeywords).toContain("政策");
    // Trigram mode keeps the full kanji block but skips bigram splitting
    expect(trigramKeywords).toContain("経済政策");
    expect(trigramKeywords).not.toContain("済政");
  });

  it("still filters stop words in trigram mode", () => {
    // Japanese function words should be dropped regardless of tokenizer mode.
    const keywords = extractKeywords("これ それ そして どう", trigramOpts);
    expect(keywords).not.toContain("これ");
    expect(keywords).not.toContain("それ");
    expect(keywords).not.toContain("そして");
    expect(keywords).not.toContain("どう");
  });

  it("does not affect English keyword extraction", () => {
    const keywords = extractKeywords("that thing we discussed about the API", trigramOpts);
    expect(keywords).toContain("discussed");
    expect(keywords).toContain("api");
    expect(keywords).not.toContain("that");
    expect(keywords).not.toContain("the");
  });
});
|
||||
});
|
||||
|
||||
describe("expandQueryForFts", () => {
|
||||
|
|
|
|||
|
|
@ -670,7 +670,8 @@ function isValidKeyword(token: string): boolean {
|
|||
* For Chinese, we do character-based splitting since we don't have a proper segmenter.
|
||||
* For English, we split on whitespace and punctuation.
|
||||
*/
|
||||
function tokenize(text: string): string[] {
|
||||
function tokenize(text: string, opts?: { ftsTokenizer?: "unicode61" | "trigram" }): string[] {
|
||||
const useTrigram = opts?.ftsTokenizer === "trigram";
|
||||
const tokens: string[] = [];
|
||||
const normalized = text.toLowerCase().trim();
|
||||
|
||||
|
|
@ -686,8 +687,10 @@ function tokenize(text: string): string[] {
|
|||
for (const part of jpParts) {
|
||||
if (/^[\u4e00-\u9fff]+$/.test(part)) {
|
||||
tokens.push(part);
|
||||
for (let i = 0; i < part.length - 1; i++) {
|
||||
tokens.push(part[i] + part[i + 1]);
|
||||
if (!useTrigram) {
|
||||
for (let i = 0; i < part.length - 1; i++) {
|
||||
tokens.push(part[i] + part[i + 1]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tokens.push(part);
|
||||
|
|
@ -695,13 +698,21 @@ function tokenize(text: string): string[] {
|
|||
}
|
||||
} else if (/[\u4e00-\u9fff]/.test(segment)) {
|
||||
// Check if segment contains CJK characters (Chinese)
|
||||
// For Chinese, extract character n-grams (unigrams and bigrams)
|
||||
const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
|
||||
// Add individual characters
|
||||
tokens.push(...chars);
|
||||
// Add bigrams for better phrase matching
|
||||
for (let i = 0; i < chars.length - 1; i++) {
|
||||
tokens.push(chars[i] + chars[i + 1]);
|
||||
if (useTrigram) {
|
||||
// In trigram mode, push the whole contiguous CJK block (mirroring the
|
||||
// Japanese kanji path). SQLite's trigram FTS requires at least 3 characters
|
||||
// per query term — individual characters silently return no results.
|
||||
const block = chars.join("");
|
||||
if (block.length > 0) {
|
||||
tokens.push(block);
|
||||
}
|
||||
} else {
|
||||
// Default mode: unigrams + bigrams for phrase matching
|
||||
tokens.push(...chars);
|
||||
for (let i = 0; i < chars.length - 1; i++) {
|
||||
tokens.push(chars[i] + chars[i + 1]);
|
||||
}
|
||||
}
|
||||
} else if (/[\uac00-\ud7af\u3131-\u3163]/.test(segment)) {
|
||||
// For Korean (Hangul syllables and jamo), keep the word as-is unless it is
|
||||
|
|
@ -732,8 +743,11 @@ function tokenize(text: string): string[] {
|
|||
* - "之前讨论的那个方案" → ["讨论", "方案"]
|
||||
* - "what was the solution for the bug" → ["solution", "bug"]
|
||||
*/
|
||||
export function extractKeywords(query: string): string[] {
|
||||
const tokens = tokenize(query);
|
||||
export function extractKeywords(
|
||||
query: string,
|
||||
opts?: { ftsTokenizer?: "unicode61" | "trigram" },
|
||||
): string[] {
|
||||
const tokens = tokenize(query, opts);
|
||||
const keywords: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
|
||||
|
|
@ -764,13 +778,16 @@ export function extractKeywords(query: string): string[] {
|
|||
* @param query - User's original query
|
||||
* @returns Object with original query and extracted keywords
|
||||
*/
|
||||
export function expandQueryForFts(query: string): {
|
||||
export function expandQueryForFts(
|
||||
query: string,
|
||||
opts?: { ftsTokenizer?: "unicode61" | "trigram" },
|
||||
): {
|
||||
original: string;
|
||||
keywords: string[];
|
||||
expanded: string;
|
||||
} {
|
||||
const original = query.trim();
|
||||
const keywords = extractKeywords(original);
|
||||
const keywords = extractKeywords(original, opts);
|
||||
|
||||
// Build expanded query: original terms OR extracted keywords
|
||||
// This ensures both exact matches and keyword matches are found
|
||||
|
|
@ -792,6 +809,7 @@ export type LlmQueryExpander = (query: string) => Promise<string[]>;
|
|||
export async function expandQueryWithLlm(
|
||||
query: string,
|
||||
llmExpander?: LlmQueryExpander,
|
||||
opts?: { ftsTokenizer?: "unicode61" | "trigram" },
|
||||
): Promise<string[]> {
|
||||
// If LLM expander is provided, try it first
|
||||
if (llmExpander) {
|
||||
|
|
@ -806,5 +824,5 @@ export async function expandQueryWithLlm(
|
|||
}
|
||||
|
||||
// Fall back to local keyword extraction
|
||||
return extractKeywords(query);
|
||||
return extractKeywords(query, opts);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,6 +43,9 @@ export type ResolvedMemorySearchConfig = {
|
|||
store: {
|
||||
driver: "sqlite";
|
||||
path: string;
|
||||
fts: {
|
||||
tokenizer: "unicode61" | "trigram";
|
||||
};
|
||||
vector: {
|
||||
enabled: boolean;
|
||||
extensionPath?: string;
|
||||
|
|
@ -206,9 +209,13 @@ function mergeConfig(
|
|||
extensionPath:
|
||||
overrides?.store?.vector?.extensionPath ?? defaults?.store?.vector?.extensionPath,
|
||||
};
|
||||
const fts = {
|
||||
tokenizer: overrides?.store?.fts?.tokenizer ?? defaults?.store?.fts?.tokenizer ?? "unicode61",
|
||||
};
|
||||
const store = {
|
||||
driver: overrides?.store?.driver ?? defaults?.store?.driver ?? "sqlite",
|
||||
path: resolveStorePath(agentId, overrides?.store?.path ?? defaults?.store?.path),
|
||||
fts,
|
||||
vector,
|
||||
};
|
||||
const chunking = {
|
||||
|
|
|
|||
|
|
@ -2033,6 +2033,24 @@ export const GENERATED_BASE_CONFIG_SCHEMA = {
|
|||
path: {
|
||||
type: "string",
|
||||
},
|
||||
fts: {
|
||||
type: "object",
|
||||
properties: {
|
||||
tokenizer: {
|
||||
anyOf: [
|
||||
{
|
||||
type: "string",
|
||||
const: "unicode61",
|
||||
},
|
||||
{
|
||||
type: "string",
|
||||
const: "trigram",
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
additionalProperties: false,
|
||||
},
|
||||
vector: {
|
||||
type: "object",
|
||||
properties: {
|
||||
|
|
@ -3596,6 +3614,24 @@ export const GENERATED_BASE_CONFIG_SCHEMA = {
|
|||
path: {
|
||||
type: "string",
|
||||
},
|
||||
fts: {
|
||||
type: "object",
|
||||
properties: {
|
||||
tokenizer: {
|
||||
anyOf: [
|
||||
{
|
||||
type: "string",
|
||||
const: "unicode61",
|
||||
},
|
||||
{
|
||||
type: "string",
|
||||
const: "trigram",
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
additionalProperties: false,
|
||||
},
|
||||
vector: {
|
||||
type: "object",
|
||||
properties: {
|
||||
|
|
|
|||
|
|
@ -379,6 +379,10 @@ export type MemorySearchConfig = {
|
|||
store?: {
|
||||
driver?: "sqlite";
|
||||
path?: string;
|
||||
fts?: {
|
||||
/** FTS5 tokenizer (default: "unicode61"). Use "trigram" for CJK text support. */
|
||||
tokenizer?: "unicode61" | "trigram";
|
||||
};
|
||||
vector?: {
|
||||
/** Enable sqlite-vec extension for vector search (default: true). */
|
||||
enabled?: boolean;
|
||||
|
|
|
|||
|
|
@ -653,6 +653,12 @@ export const MemorySearchSchema = z
|
|||
.object({
|
||||
driver: z.literal("sqlite").optional(),
|
||||
path: z.string().optional(),
|
||||
fts: z
|
||||
.object({
|
||||
tokenizer: z.union([z.literal("unicode61"), z.literal("trigram")]).optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
vector: z
|
||||
.object({
|
||||
enabled: z.boolean().optional(),
|
||||
|
|
|
|||
Loading…
Reference in New Issue