diff --git a/migrations/034_hybrid_search.sql b/migrations/034_hybrid_search.sql
new file mode 100644
index 0000000..5f91b76
--- /dev/null
+++ b/migrations/034_hybrid_search.sql
@@ -0,0 +1,59 @@
+-- CF-1315: Hybrid search - tsvector columns, triggers, backfill, GIN indexes
+--
+-- Every statement is guarded (IF NOT EXISTS / OR REPLACE / DROP ... IF EXISTS /
+-- WHERE search_vector IS NULL), so the migration is safe to re-run.
+
+-- 1. Add search_vector columns
+ALTER TABLE project_archives ADD COLUMN IF NOT EXISTS search_vector tsvector;
+ALTER TABLE memories ADD COLUMN IF NOT EXISTS search_vector tsvector;
+ALTER TABLE sessions ADD COLUMN IF NOT EXISTS search_vector tsvector;
+
+-- 2. Triggers to auto-populate search_vector on INSERT/UPDATE
+CREATE OR REPLACE FUNCTION update_archives_search_vector() RETURNS TRIGGER AS $$
+BEGIN
+  NEW.search_vector := to_tsvector('english', coalesce(NEW.title, '') || ' ' || coalesce(NEW.content, ''));
+  RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION update_memories_search_vector() RETURNS TRIGGER AS $$
+BEGIN
+  NEW.search_vector := to_tsvector('english', coalesce(NEW.title, '') || ' ' || coalesce(NEW.content, ''));
+  RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION update_sessions_search_vector() RETURNS TRIGGER AS $$
+BEGIN
+  NEW.search_vector := to_tsvector('english', coalesce(NEW.summary, ''));
+  RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- UPDATE OF <columns> re-indexes a row only when its source text is written.
+DROP TRIGGER IF EXISTS trg_archives_search_vector ON project_archives;
+CREATE TRIGGER trg_archives_search_vector
+  BEFORE INSERT OR UPDATE OF title, content ON project_archives
+  FOR EACH ROW EXECUTE FUNCTION update_archives_search_vector();
+
+DROP TRIGGER IF EXISTS trg_memories_search_vector ON memories;
+CREATE TRIGGER trg_memories_search_vector
+  BEFORE INSERT OR UPDATE OF title, content ON memories
+  FOR EACH ROW EXECUTE FUNCTION update_memories_search_vector();
+
+DROP TRIGGER IF EXISTS trg_sessions_search_vector ON sessions;
+CREATE TRIGGER trg_sessions_search_vector
+  BEFORE INSERT OR UPDATE OF summary ON sessions
+  FOR EACH ROW EXECUTE FUNCTION update_sessions_search_vector();
+
+-- 3. Backfill existing rows (no-op if tables empty, safe to re-run)
+--    These UPDATEs set only search_vector, so the triggers above do not fire.
+UPDATE project_archives SET search_vector = to_tsvector('english', coalesce(title, '') || ' ' || coalesce(content, '')) WHERE search_vector IS NULL;
+UPDATE memories SET search_vector = to_tsvector('english', coalesce(title, '') || ' ' || coalesce(content, '')) WHERE search_vector IS NULL;
+UPDATE sessions SET search_vector = to_tsvector('english', coalesce(summary, '')) WHERE search_vector IS NULL AND summary IS NOT NULL;
+
+-- 4. GIN indexes for fast full-text search. Created after the backfill so each
+--    index is built once in bulk instead of being updated for every backfilled row.
+CREATE INDEX IF NOT EXISTS idx_archives_search_vector ON project_archives USING gin(search_vector);
+CREATE INDEX IF NOT EXISTS idx_memories_search_vector ON memories USING gin(search_vector);
+CREATE INDEX IF NOT EXISTS idx_sessions_search_vector ON sessions USING gin(search_vector);
diff --git a/src/embeddings.ts b/src/embeddings.ts
index 083e802..ecd86a5 100644
--- a/src/embeddings.ts
+++ b/src/embeddings.ts
@@ -67,3 +67,43 @@ export async function getEmbedding(text: string): Promise {
 export function formatEmbedding(embedding: number[]): string {
   return `[${embedding.join(',')}]`;
 }
+
+/**
+ * Reciprocal Rank Fusion — merge two ranked result lists (CF-1315).
+ *
+ * Each list adds `1 / (k + rank)` (rank is 1-based) to the score of every ID it
+ * contains, so an ID present in both lists receives the sum. An ID repeated
+ * within one list is credited only for its first (best) position.
+ *
+ * @param vectorResults IDs ranked by vector similarity (best first)
+ * @param keywordResults IDs ranked by ts_rank (best first)
+ * @param k RRF parameter (default 60, standard); must be finite and >= 0
+ * @returns Merged IDs sorted by RRF score descending; equal scores keep
+ *   first-seen order (vector list first, then keyword list)
+ * @throws RangeError if `k` is negative, NaN, or infinite
+ */
+export function rrfMerge(
+  vectorResults: (number | string)[],
+  keywordResults: (number | string)[],
+  k: number = 60
+): { id: number | string; score: number }[] {
+  // A negative k can make a denominator zero or negative (k = -1 gives Infinity).
+  if (!Number.isFinite(k) || k < 0) {
+    throw new RangeError(`rrfMerge: k must be a finite number >= 0, got ${k}`);
+  }
+
+  const scores = new Map<number | string, number>();
+
+  for (const ranked of [vectorResults, keywordResults]) {
+    const seen = new Set<number | string>();
+    ranked.forEach((id, rank) => {
+      if (seen.has(id)) return; // a later duplicate carries no extra weight
+      seen.add(id);
+      scores.set(id, (scores.get(id) ?? 0) + 1 / (k + rank + 1));
+    });
+  }
+
+  // Array.prototype.sort is stable, so equal scores keep insertion order.
+  return Array.from(scores, ([id, score]) => ({ id, score }))
+    .sort((a, b) => b.score - a.score);
+}
diff --git a/src/index.ts
b/src/index.ts index 44f4d16..1307dfd 100644 --- a/src/index.ts +++ b/src/index.ts @@ -247,6 +247,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { project: a.project, category: a.category, limit: a.limit, + search_mode: a.search_mode, }); break; case 'memory_list': @@ -337,6 +338,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { query: a.query, project: a.project, limit: a.limit, + search_mode: a.search_mode, }); break; case 'session_context': @@ -454,6 +456,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { query: a.query, project: a.project, limit: a.limit, + search_mode: a.search_mode, }), null, 2 @@ -499,6 +502,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => { project: a.project, archive_type: a.archive_type, limit: a.limit, + search_mode: a.search_mode, }); break; case 'archive_list': diff --git a/src/tools/archives.ts b/src/tools/archives.ts index 43e63d0..d8aabaf 100644 --- a/src/tools/archives.ts +++ b/src/tools/archives.ts @@ -1,7 +1,7 @@ // Project archives operations for database-backed archival import { query, queryOne, execute } from '../db.js'; -import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js'; +import { getEmbedding, formatEmbedding, generateContentHash, rrfMerge } from '../embeddings.js'; type ArchiveType = 'session' | 'research' | 'audit' | 'investigation' | 'completed' | 'migration'; @@ -31,11 +31,14 @@ interface ArchiveAddArgs { metadata?: Record; } +type SearchMode = 'hybrid' | 'vector' | 'keyword'; + interface ArchiveSearchArgs { query: string; project?: string; archive_type?: ArchiveType; limit?: number; + search_mode?: SearchMode; } interface ArchiveListArgs { @@ -111,97 +114,107 @@ export async function archiveAdd(args: ArchiveAddArgs): Promise { } /** - * Search archives semantically + * Search archives with hybrid (vector + keyword), vector-only, or keyword-only mode (CF-1315) */ export async function 
archiveSearch(args: ArchiveSearchArgs): Promise { - const { query: searchQuery, project, archive_type, limit = 5 } = args; - - // Generate embedding for search - const embedding = await getEmbedding(searchQuery); - - // Fallback to text search if embeddings unavailable - if (!embedding) { - console.warn('Embeddings unavailable, falling back to text search'); - - let whereClause = '(title ILIKE $1 OR content ILIKE $1)'; - const params: unknown[] = [`%${searchQuery}%`]; - let paramIndex = 2; + const { query: searchQuery, project, archive_type, limit = 5, search_mode = 'hybrid' } = args; + // Build shared filter clause + const buildFilter = (startIdx: number) => { + let where = ''; + const params: unknown[] = []; + let idx = startIdx; if (project) { - whereClause += ` AND project_key = $${paramIndex++}`; + where += ` AND project_key = $${idx++}`; params.push(project); } if (archive_type) { - whereClause += ` AND archive_type = $${paramIndex++}`; + where += ` AND archive_type = $${idx++}`; params.push(archive_type); } + return { where, params, nextIdx: idx }; + }; - params.push(limit); + // Vector search + let vectorIds: number[] = []; + let vectorRows: Map = new Map(); + let embeddingFailed = false; - const archives = await query( + if (search_mode !== 'keyword') { + const embedding = await getEmbedding(searchQuery); + if (embedding) { + const embeddingStr = formatEmbedding(embedding); + const filter = buildFilter(3); + const params: unknown[] = [embeddingStr, limit, ...filter.params]; + + const rows = await query( + `SELECT id, archive_type, title, original_path, file_size, + to_char(archived_at, 'YYYY-MM-DD') as archived_at, + 1 - (embedding <=> $1) as similarity + FROM project_archives + WHERE embedding IS NOT NULL${filter.where} + ORDER BY embedding <=> $1 + LIMIT $2`, + params + ); + vectorIds = rows.map(r => r.id); + for (const r of rows) vectorRows.set(r.id, r); + } else { + embeddingFailed = true; + if (search_mode === 'vector') { + return 'Error: Could not 
generate embedding for vector search'; + } + } + } + + // Keyword search + let keywordIds: number[] = []; + let keywordRows: Map = new Map(); + + if (search_mode !== 'vector') { + const filter = buildFilter(3); + const params: unknown[] = [searchQuery, limit, ...filter.params]; + + const rows = await query( `SELECT id, archive_type, title, original_path, file_size, - to_char(archived_at, 'YYYY-MM-DD') as archived_at + to_char(archived_at, 'YYYY-MM-DD') as archived_at, + ts_rank(search_vector, plainto_tsquery('english', $1)) as rank FROM project_archives - WHERE ${whereClause} - ORDER BY archived_at DESC - LIMIT $${paramIndex}`, + WHERE search_vector @@ plainto_tsquery('english', $1)${filter.where} + ORDER BY rank DESC + LIMIT $2`, params ); - - if (archives.length === 0) { - return 'No relevant archives found'; - } - - const lines = ['Relevant archives (text search - embeddings unavailable):\n']; - for (const a of archives) { - const sizeStr = a.file_size ? ` (${Math.round(a.file_size / 1024)}KB)` : ''; - lines.push(`**[${a.archive_type}]** ${a.title}`); - lines.push(` Archived: ${a.archived_at}${sizeStr}`); - if (a.original_path) { - lines.push(` Path: ${a.original_path}`); - } - lines.push(''); - } - - return lines.join('\n'); + keywordIds = rows.map(r => r.id); + for (const r of rows) keywordRows.set(r.id, r); } - // Semantic search with embeddings - const embeddingStr = formatEmbedding(embedding); + // Merge results + let finalIds: number[]; + let searchLabel: string; - let whereClause = 'WHERE embedding IS NOT NULL'; - const params: unknown[] = [embeddingStr, limit]; - let paramIndex = 3; - - if (project) { - whereClause += ` AND project_key = $${paramIndex++}`; - params.splice(params.length - 1, 0, project); - } - if (archive_type) { - whereClause += ` AND archive_type = $${paramIndex++}`; - params.splice(params.length - 1, 0, archive_type); - } - - const archives = await query( - `SELECT id, archive_type, title, original_path, file_size, - 
to_char(archived_at, 'YYYY-MM-DD') as archived_at, - 1 - (embedding <=> $1) as similarity - FROM project_archives - ${whereClause} - ORDER BY embedding <=> $1 - LIMIT $2`, - params - ); - - if (archives.length === 0) { + if (search_mode === 'hybrid' && vectorIds.length > 0 && keywordIds.length > 0) { + const merged = rrfMerge(vectorIds, keywordIds); + finalIds = merged.slice(0, limit).map(m => m.id as number); + searchLabel = 'hybrid'; + } else if (vectorIds.length > 0) { + finalIds = vectorIds; + searchLabel = 'vector'; + } else if (keywordIds.length > 0) { + finalIds = keywordIds; + searchLabel = embeddingFailed ? 'keyword (embedding unavailable)' : 'keyword'; + } else { return 'No relevant archives found'; } - const lines = ['Relevant archives:\n']; - for (const a of archives) { - const sim = Math.round(a.similarity * 100); + // Format output + const lines = [`Relevant archives (${searchLabel}):\n`]; + for (const id of finalIds) { + const a = vectorRows.get(id) || keywordRows.get(id); + if (!a) continue; + const sim = vectorRows.has(id) ? ` (${Math.round((vectorRows.get(id)!).similarity * 100)}% match)` : ''; const sizeStr = a.file_size ? 
` (${Math.round(a.file_size / 1024)}KB)` : ''; - lines.push(`**[${a.archive_type}]** ${a.title} (${sim}% match)`); + lines.push(`**[${a.archive_type}]** ${a.title}${sim}`); lines.push(` Archived: ${a.archived_at}${sizeStr}`); if (a.original_path) { lines.push(` Path: ${a.original_path}`); diff --git a/src/tools/index.ts b/src/tools/index.ts index 31e4a40..e60b5b2 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -318,7 +318,7 @@ export const toolDefinitions = [ }, { name: 'memory_search', - description: 'Search memories semantically.', + description: 'Search memories using hybrid (vector + keyword), vector-only, or keyword-only search.', inputSchema: { type: 'object', properties: { @@ -326,6 +326,7 @@ export const toolDefinitions = [ project: { type: 'string', description: 'Filter by project (optional)' }, category: { type: 'string', enum: ['pattern', 'fix', 'preference', 'gotcha', 'architecture'], description: 'Filter by category (optional)' }, limit: { type: 'number', description: 'Max results (default: 5)' }, + search_mode: { type: 'string', enum: ['hybrid', 'vector', 'keyword'], description: 'Search mode (default: hybrid)' }, }, required: ['query'], }, @@ -479,13 +480,14 @@ export const toolDefinitions = [ }, { name: 'session_search', - description: 'Find similar sessions using vector search', + description: 'Find similar sessions using hybrid (vector + keyword), vector-only, or keyword-only search.', inputSchema: { type: 'object', properties: { query: { type: 'string', description: 'Search query' }, project: { type: 'string', description: 'Filter by project (optional)' }, limit: { type: 'number', description: 'Max results (default: 5)' }, + search_mode: { type: 'string', enum: ['hybrid', 'vector', 'keyword'], description: 'Search mode (default: hybrid)' }, }, required: ['query'], }, @@ -670,13 +672,14 @@ export const toolDefinitions = [ }, { name: 'session_semantic_search', - description: 'Semantic search across all session documentation', + 
description: 'Search across all session documentation using hybrid (vector + keyword), vector-only, or keyword-only search.', inputSchema: { type: 'object', properties: { query: { type: 'string', description: 'Search query' }, project: { type: 'string', description: 'Filter by project (optional)' }, limit: { type: 'number', description: 'Max results (default: 10)' }, + search_mode: { type: 'string', enum: ['hybrid', 'vector', 'keyword'], description: 'Search mode (default: hybrid)' }, }, required: ['query'], }, @@ -725,7 +728,7 @@ export const toolDefinitions = [ }, { name: 'archive_search', - description: 'Search archives using semantic similarity', + description: 'Search archives using hybrid (vector + keyword), vector-only, or keyword-only search.', inputSchema: { type: 'object', properties: { @@ -733,6 +736,7 @@ export const toolDefinitions = [ project: { type: 'string', description: 'Filter by project (optional)' }, archive_type: { type: 'string', enum: ['session', 'research', 'audit', 'investigation', 'completed', 'migration'], description: 'Filter by archive type (optional)' }, limit: { type: 'number', description: 'Max results (default: 5)' }, + search_mode: { type: 'string', enum: ['hybrid', 'vector', 'keyword'], description: 'Search mode (default: hybrid)' }, }, required: ['query'], }, diff --git a/src/tools/memories.ts b/src/tools/memories.ts index 683e684..7c9b28a 100644 --- a/src/tools/memories.ts +++ b/src/tools/memories.ts @@ -1,7 +1,7 @@ // Session memory operations for persistent learnings import { query, queryOne, execute } from '../db.js'; -import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js'; +import { getEmbedding, formatEmbedding, generateContentHash, rrfMerge } from '../embeddings.js'; type MemoryCategory = 'pattern' | 'fix' | 'preference' | 'gotcha' | 'architecture'; @@ -28,11 +28,14 @@ interface MemoryAddArgs { task_id?: string; } +type SearchMode = 'hybrid' | 'vector' | 'keyword'; + interface 
MemorySearchArgs { query: string; project?: string; category?: MemoryCategory; limit?: number; + search_mode?: SearchMode; } interface MemoryListArgs { @@ -93,60 +96,113 @@ export async function memoryAdd(args: MemoryAddArgs): Promise { } /** - * Search memories semantically + * Search memories with hybrid (vector + keyword), vector-only, or keyword-only mode (CF-1315) */ export async function memorySearch(args: MemorySearchArgs): Promise { - const { query: searchQuery, project, category, limit = 5 } = args; + const { query: searchQuery, project, category, limit = 5, search_mode = 'hybrid' } = args; - // Generate embedding for search - const embedding = await getEmbedding(searchQuery); + // Build shared filter clause + const buildFilter = (startIdx: number) => { + let where = ''; + const params: unknown[] = []; + let idx = startIdx; + if (project) { + where += ` AND (project = $${idx++} OR project IS NULL)`; + params.push(project); + } + if (category) { + where += ` AND category = $${idx++}`; + params.push(category); + } + return { where, params, nextIdx: idx }; + }; - if (!embedding) { - return 'Error: Could not generate embedding for search'; + // Vector search + let vectorIds: number[] = []; + let vectorRows: Map = new Map(); + let embeddingFailed = false; + + if (search_mode !== 'keyword') { + const embedding = await getEmbedding(searchQuery); + if (embedding) { + const embeddingStr = formatEmbedding(embedding); + const filter = buildFilter(3); + const params: unknown[] = [embeddingStr, limit, ...filter.params]; + + const rows = await query( + `SELECT id, category, title, content, context, project, access_count, + to_char(created_at, 'YYYY-MM-DD') as created_at, + 1 - (embedding <=> $1) as similarity + FROM memories + WHERE embedding IS NOT NULL${filter.where} + ORDER BY embedding <=> $1 + LIMIT $2`, + params + ); + vectorIds = rows.map(r => r.id); + for (const r of rows) vectorRows.set(r.id, r); + } else { + embeddingFailed = true; + if (search_mode === 
'vector') { + return 'Error: Could not generate embedding for vector search'; + } + } } - const embeddingStr = formatEmbedding(embedding); + // Keyword search + let keywordIds: number[] = []; + let keywordRows: Map = new Map(); - let whereClause = 'WHERE embedding IS NOT NULL'; - const params: unknown[] = [embeddingStr, limit]; - let paramIndex = 3; + if (search_mode !== 'vector') { + const filter = buildFilter(3); + const params: unknown[] = [searchQuery, limit, ...filter.params]; - if (project) { - whereClause += ` AND (project = $${paramIndex++} OR project IS NULL)`; - params.splice(params.length - 1, 0, project); - } - if (category) { - whereClause += ` AND category = $${paramIndex++}`; - params.splice(params.length - 1, 0, category); + const rows = await query( + `SELECT id, category, title, content, context, project, access_count, + to_char(created_at, 'YYYY-MM-DD') as created_at, + ts_rank(search_vector, plainto_tsquery('english', $1)) as rank + FROM memories + WHERE search_vector @@ plainto_tsquery('english', $1)${filter.where} + ORDER BY rank DESC + LIMIT $2`, + params + ); + keywordIds = rows.map(r => r.id); + for (const r of rows) keywordRows.set(r.id, r); } - const memories = await query( - `SELECT id, category, title, content, context, project, access_count, - to_char(created_at, 'YYYY-MM-DD') as created_at, - 1 - (embedding <=> $1) as similarity - FROM memories - ${whereClause} - ORDER BY embedding <=> $1 - LIMIT $2`, - params - ); + // Merge results + let finalIds: number[]; + let searchLabel: string; - if (memories.length === 0) { + if (search_mode === 'hybrid' && vectorIds.length > 0 && keywordIds.length > 0) { + const merged = rrfMerge(vectorIds, keywordIds); + finalIds = merged.slice(0, limit).map(m => m.id as number); + searchLabel = 'hybrid'; + } else if (vectorIds.length > 0) { + finalIds = vectorIds; + searchLabel = 'vector'; + } else if (keywordIds.length > 0) { + finalIds = keywordIds; + searchLabel = embeddingFailed ? 
'keyword (embedding unavailable)' : 'keyword'; + } else { return 'No relevant memories found'; } // Update access_count for returned memories - const ids = memories.map(m => m.id); await execute( `UPDATE memories SET access_count = access_count + 1, last_accessed_at = NOW() WHERE id = ANY($1)`, - [ids] + [finalIds] ); - const lines = ['Relevant memories:\n']; - for (const m of memories) { - const sim = Math.round(m.similarity * 100); + // Format output + const lines = [`Relevant memories (${searchLabel}):\n`]; + for (const id of finalIds) { + const m = vectorRows.get(id) || keywordRows.get(id); + if (!m) continue; + const sim = vectorRows.has(id) ? ` (${Math.round((vectorRows.get(id)!).similarity * 100)}% match)` : ''; const proj = m.project ? ` [${m.project}]` : ''; - lines.push(`**[${m.category}]${proj}** ${m.title} (${sim}% match)`); + lines.push(`**[${m.category}]${proj}** ${m.title}${sim}`); lines.push(` ${m.content}`); if (m.context) { lines.push(` _Context: ${m.context}_`); diff --git a/src/tools/session-docs.ts b/src/tools/session-docs.ts index 8d88209..07a65c0 100644 --- a/src/tools/session-docs.ts +++ b/src/tools/session-docs.ts @@ -2,7 +2,7 @@ // Replaces file-based CLAUDE.md and plan files with database storage import { query, queryOne, execute } from '../db.js'; -import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js'; +import { getEmbedding, formatEmbedding, generateContentHash, rrfMerge } from '../embeddings.js'; import { getSessionId } from './session-id.js'; // ============================================================================ @@ -451,10 +451,13 @@ export async function sessionDocumentationGenerate(args: SessionDocumentationGen // SEMANTIC SEARCH & ANALYTICS // ============================================================================ +type SearchMode = 'hybrid' | 'vector' | 'keyword'; + interface SessionSemanticSearchArgs { query: string; project?: string; limit?: number; + search_mode?: SearchMode; } 
interface SessionSearchResult {
@@ -467,60 +470,92 @@ interface SessionSearchResult {
 }
 
 /**
- * Semantic search across all session documentation
- * Uses vector similarity to find related sessions
+ * Search completed sessions in hybrid, vector-only, or keyword-only mode (CF-1315).
+ *
+ * - `vector`: rank by vector similarity, `1 - (s.embedding <=> query embedding)`.
+ * - `keyword`: full-text match of the query against `s.search_vector`.
+ * - `hybrid` (default): run both and fuse the rankings with `rrfMerge`. If only
+ *   one of them yields rows, that ranking is returned as-is.
+ *
+ * `similarity` on a returned row is the vector similarity when the vector search
+ * found the session, otherwise the `ts_rank` of the keyword match. The two are
+ * not on the same scale, so rely on array order for ranking.
+ *
+ * @param args Search input: `query` (required), optional `project` filter,
+ *   `limit` (default 10) and `search_mode` (default 'hybrid')
+ * @returns Matching sessions, best first; `[]` when nothing matches or when
+ *   `vector` mode is requested but no embedding could be generated
  */
 export async function sessionSemanticSearch(args: SessionSemanticSearchArgs): Promise {
-  const { query: searchQuery, project, limit = 10 } = args;
-
-  // Generate embedding for search query
-  const queryEmbedding = await getEmbedding(searchQuery);
-
-  if (!queryEmbedding) {
-    // Fallback to text search if embedding generation fails
-    let sql = `
-      SELECT
-        s.id as session_id,
-        s.session_number,
-        s.project,
-        s.summary,
-        s.started_at,
-        0.5 as similarity
-      FROM sessions s
-      WHERE s.summary IS NOT NULL
-        AND s.status = 'completed'
-        ${project ? 'AND s.project = $1' : ''}
-        AND s.summary ILIKE $${project ? '2' : '1'}
-      ORDER BY s.started_at DESC
-      LIMIT $${project ? '3' : '2'}
-    `;
-
-    const params: unknown[] = project ? [project, `%${searchQuery}%`, limit] : [`%${searchQuery}%`, limit];
-    const results = await query(sql, params);
-    return results;
-  }
-
-  const embeddingFormatted = formatEmbedding(queryEmbedding);
-
-  // Vector similarity search
-  let sql = `
-    SELECT
-      s.id as session_id,
-      s.session_number,
-      s.project,
-      s.summary,
-      s.started_at,
-      1 - (s.embedding <=> $1) as similarity
-    FROM sessions s
-    WHERE s.embedding IS NOT NULL
-      ${project ? 'AND s.project = $2' : ''}
-      AND s.status = 'completed'
-    ORDER BY s.embedding <=> $1
-    LIMIT $${project ? '3' : '2'}
-  `;
-
-  const params: unknown[] = project ? [embeddingFormatted, project, limit] : [embeddingFormatted, limit];
-  const results = await query(sql, params);
+  const { query: searchQuery, project, limit = 10, search_mode = 'hybrid' } = args;
+
+  // Both queries bind $1 = search input and $2 = limit, so the optional
+  // project filter is always $3.
+  const projectFilter = project ? ' AND s.project = $3' : '';
+  const filterParams: unknown[] = project ? [project] : [];
+
+  // Vector search. Resolves to null when no embedding could be generated, and
+  // that case is logged instead of silently looking like "no matches".
+  const runVectorSearch = async (): Promise<SessionSearchResult[] | null> => {
+    const queryEmbedding = await getEmbedding(searchQuery);
+    if (!queryEmbedding) {
+      console.warn(`sessionSemanticSearch: embedding unavailable (search_mode=${search_mode})`);
+      return null;
+    }
+    return query<SessionSearchResult>(
+      `SELECT s.id as session_id, s.session_number, s.project, s.summary, s.started_at,
+              1 - (s.embedding <=> $1) as similarity
+       FROM sessions s
+       WHERE s.embedding IS NOT NULL AND s.status = 'completed'${projectFilter}
+       ORDER BY s.embedding <=> $1
+       LIMIT $2`,
+      [formatEmbedding(queryEmbedding), limit, ...filterParams]
+    );
+  };
+
+  // Keyword search. ts_rank is aliased to `similarity` to fit SessionSearchResult.
+  const runKeywordSearch = (): Promise<SessionSearchResult[]> =>
+    query<SessionSearchResult>(
+      `SELECT s.id as session_id, s.session_number, s.project, s.summary, s.started_at,
+              ts_rank(s.search_vector, plainto_tsquery('english', $1)) as similarity
+       FROM sessions s
+       WHERE s.search_vector @@ plainto_tsquery('english', $1)
+         AND s.status = 'completed'${projectFilter}
+       ORDER BY similarity DESC
+       LIMIT $2`,
+      [searchQuery, limit, ...filterParams]
+    );
+
+  // The two searches are independent, so run them concurrently.
+  const [vectorHits, keywordHits] = await Promise.all([
+    search_mode === 'keyword' ? null : runVectorSearch(),
+    search_mode === 'vector' ? [] : runKeywordSearch(),
+  ]);
+  const vectorRows = vectorHits ?? [];
+
+  // Index rows by session id. Vector rows are written last so that a session
+  // found by both searches reports its vector similarity.
+  const rowsById = new Map<number | string, SessionSearchResult>();
+  for (const row of keywordHits) rowsById.set(row.session_id, row);
+  for (const row of vectorRows) rowsById.set(row.session_id, row);
+
+  const vectorIds = vectorRows.map((row) => row.session_id);
+  const keywordIds = keywordHits.map((row) => row.session_id);
+
+  // Fuse only when both searches produced rows; otherwise keep whichever
+  // ranking exists (possibly none).
+  let rankedIds: (number | string)[];
+  if (search_mode === 'hybrid' && vectorIds.length > 0 && keywordIds.length > 0) {
+    rankedIds = rrfMerge(vectorIds, keywordIds).slice(0, limit).map((m) => m.id);
+  } else {
+    rankedIds = vectorIds.length > 0 ? vectorIds : keywordIds;
+  }
+
+  const results: SessionSearchResult[] = [];
+  for (const id of rankedIds) {
+    const row = rowsById.get(id);
+    if (row) results.push(row);
+  }
 
   return results;
 }
diff --git a/src/tools/sessions.ts b/src/tools/sessions.ts
index 2947b2c..7d5c0b3 100644
--- a/src/tools/sessions.ts
+++ b/src/tools/sessions.ts
@@ -2,7 +2,7 @@
 // Sessions auto-create CF Jira issues and post output on close (CF-762)
 
 import { query, queryOne, execute } from '../db.js';
-import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
+import { getEmbedding, formatEmbedding, generateContentHash, rrfMerge } from '../embeddings.js';
 import { createSessionIssue, addComment, transitionToDone, updateIssueDescription } from '../services/jira.js';
 
 interface SessionStartArgs {
@@ -34,10 +34,13 @@ interface SessionListArgs {
   limit?: number;
 }
 
+type SearchMode = 'hybrid' | 'vector' | 'keyword';
+
 interface SessionSearchArgs {
   query: string;
project?: string; limit?: number; + search_mode?: SearchMode; } interface Session { @@ -336,49 +339,102 @@ export async function sessionList(args: SessionListArgs): Promise { } /** - * Semantic search across sessions using vector similarity + * Search sessions with hybrid (vector + keyword), vector-only, or keyword-only mode (CF-1315) */ export async function sessionSearch(args: SessionSearchArgs): Promise { - const { query: searchQuery, project, limit = 5 } = args; + const { query: searchQuery, project, limit = 5, search_mode = 'hybrid' } = args; - // Generate embedding for search - const embedding = await getEmbedding(searchQuery); + // Build shared filter clause + const buildFilter = (startIdx: number) => { + let where = ''; + const params: unknown[] = []; + let idx = startIdx; + if (project) { + where += ` AND project = $${idx++}`; + params.push(project); + } + return { where, params, nextIdx: idx }; + }; - if (!embedding) { - return 'Error: Could not generate embedding for search'; + // Vector search + let vectorIds: string[] = []; + let vectorRows: Map = new Map(); + let embeddingFailed = false; + + if (search_mode !== 'keyword') { + const embedding = await getEmbedding(searchQuery); + if (embedding) { + const embeddingStr = formatEmbedding(embedding); + const filter = buildFilter(3); + const params: unknown[] = [embeddingStr, limit, ...filter.params]; + + const rows = await query( + `SELECT id, project, session_number, started_at, duration_minutes, summary, + 1 - (embedding <=> $1) as similarity + FROM sessions + WHERE embedding IS NOT NULL${filter.where} + ORDER BY embedding <=> $1 + LIMIT $2`, + params + ); + vectorIds = rows.map(r => r.id); + for (const r of rows) vectorRows.set(r.id, r); + } else { + embeddingFailed = true; + if (search_mode === 'vector') { + return 'Error: Could not generate embedding for vector search'; + } + } } - const embeddingStr = formatEmbedding(embedding); + // Keyword search + let keywordIds: string[] = []; + let keywordRows: 
Map = new Map(); - let whereClause = 'WHERE embedding IS NOT NULL'; - const params: unknown[] = [embeddingStr, limit]; + if (search_mode !== 'vector') { + const filter = buildFilter(3); + const params: unknown[] = [searchQuery, limit, ...filter.params]; - if (project) { - whereClause += ` AND project = $3`; - params.splice(1, 0, project); // Insert before limit - params[2] = limit; // Adjust limit position + const rows = await query( + `SELECT id, project, session_number, started_at, duration_minutes, summary, + ts_rank(search_vector, plainto_tsquery('english', $1)) as rank + FROM sessions + WHERE search_vector @@ plainto_tsquery('english', $1)${filter.where} + ORDER BY rank DESC + LIMIT $2`, + params + ); + keywordIds = rows.map(r => r.id); + for (const r of rows) keywordRows.set(r.id, r); } - const sessions = await query( - `SELECT id, project, session_number, started_at, duration_minutes, summary, - 1 - (embedding <=> $1) as similarity - FROM sessions - ${whereClause} - ORDER BY embedding <=> $1 - LIMIT $${project ? '3' : '2'}`, - params - ); + // Merge results + let finalIds: string[]; + let searchLabel: string; - if (sessions.length === 0) { + if (search_mode === 'hybrid' && vectorIds.length > 0 && keywordIds.length > 0) { + const merged = rrfMerge(vectorIds, keywordIds); + finalIds = merged.slice(0, limit).map(m => m.id as string); + searchLabel = 'hybrid'; + } else if (vectorIds.length > 0) { + finalIds = vectorIds; + searchLabel = 'vector'; + } else if (keywordIds.length > 0) { + finalIds = keywordIds; + searchLabel = embeddingFailed ? 
'keyword (embedding unavailable)' : 'keyword'; + } else { return 'No relevant sessions found'; } - const lines = ['Similar sessions:\n']; - for (const s of sessions) { - const sim = Math.round(s.similarity * 100); + // Format output + const lines = [`Similar sessions (${searchLabel}):\n`]; + for (const id of finalIds) { + const s = vectorRows.get(id) || keywordRows.get(id); + if (!s) continue; + const sim = vectorRows.has(id) ? ` (${Math.round((vectorRows.get(id)!).similarity * 100)}% match)` : ''; const num = s.session_number ? `#${s.session_number}` : ''; const duration = s.duration_minutes ? `(${s.duration_minutes}m)` : ''; - lines.push(`**${s.project} ${num}** ${duration} (${sim}% match)`); + lines.push(`**${s.project} ${num}** ${duration}${sim}`); lines.push(` ${s.summary || 'No summary'}`); lines.push(''); }