feat(CF-1315): Hybrid search with tsvector + RRF

Add PostgreSQL full-text search alongside pgvector for exact matches
on Jira keys, error messages, and file paths. Merge results with
Reciprocal Rank Fusion. Default mode: hybrid, with graceful
degradation to keyword-only when embeddings are unavailable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-18 08:46:39 +02:00
parent 1f499bd926
commit 4f8996cd82
8 changed files with 434 additions and 183 deletions

View File

@@ -2,7 +2,7 @@
// Replaces file-based CLAUDE.md and plan files with database storage
import { query, queryOne, execute } from '../db.js';
import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
import { getEmbedding, formatEmbedding, generateContentHash, rrfMerge } from '../embeddings.js';
import { getSessionId } from './session-id.js';
// ============================================================================
@@ -451,10 +451,13 @@ export async function sessionDocumentationGenerate(args: SessionDocumentationGen
// SEMANTIC SEARCH & ANALYTICS
// ============================================================================
/**
 * Retrieval strategy accepted by the session search:
 * vector similarity only, full-text keyword matching only, or both combined.
 */
type SearchMode =
  | 'hybrid'
  | 'vector'
  | 'keyword';

/** Arguments accepted by `sessionSemanticSearch`. */
interface SessionSemanticSearchArgs {
  /** Free-text search input; embedded for vector search and parsed as a text query for keyword search. */
  query: string;
  /** When set, only sessions whose `project` equals this value are searched. */
  project?: string;
  /** Maximum number of results to return (the search defaults this to 10). */
  limit?: number;
  /** Retrieval strategy (the search defaults this to `'hybrid'`). */
  search_mode?: SearchMode;
}
interface SessionSearchResult {
@@ -467,60 +470,96 @@ interface SessionSearchResult {
}
/**
 * Search completed sessions with hybrid, vector-only, or keyword-only retrieval (CF-1315).
 *
 * - `vector`: embeds the query and orders sessions by pgvector distance
 *   (`<=>`) to `sessions.embedding`; `similarity` is `1 - distance`.
 * - `keyword`: matches `sessions.search_vector` against
 *   `plainto_tsquery('english', query)`; `similarity` is the `ts_rank` score.
 * - `hybrid` (default): runs both and fuses the two ranked id lists with
 *   `rrfMerge`. If only one of the two searches produced rows, those rows are
 *   returned unchanged, so a failed embedding degrades to keyword-only.
 *
 * Each returned row keeps the `similarity` computed by the query it came from;
 * in hybrid mode a session found by both searches keeps its vector row, so
 * scores in one result list may be on two different scales.
 *
 * @param args - Search text plus optional project filter, result limit
 *   (default 10) and search mode (default `'hybrid'`).
 * @returns At most `limit` sessions, best match first. Empty when nothing
 *   matches, or when `search_mode` is `'vector'` and no embedding could be
 *   generated.
 */
export async function sessionSemanticSearch(args: SessionSemanticSearchArgs): Promise<SessionSearchResult[]> {
  const { query: searchQuery, project, limit = 10, search_mode = 'hybrid' } = args;

  // Both queries bind $1 (embedding or search text) and $2 (limit), so the
  // optional project filter is always $3 and can be built once and shared.
  const projectClause = project ? ' AND s.project = $3' : '';
  const projectParams: unknown[] = project ? [project] : [];

  // Vector search (skipped in keyword-only mode).
  let vectorRows: SessionSearchResult[] = [];
  if (search_mode !== 'keyword') {
    const queryEmbedding = await getEmbedding(searchQuery);
    if (queryEmbedding) {
      vectorRows = await query<SessionSearchResult>(
        `SELECT s.id as session_id, s.session_number, s.project, s.summary, s.started_at,
                1 - (s.embedding <=> $1) as similarity
         FROM sessions s
         WHERE s.embedding IS NOT NULL AND s.status = 'completed'${projectClause}
         ORDER BY s.embedding <=> $1
         LIMIT $2`,
        [formatEmbedding(queryEmbedding), limit, ...projectParams]
      );
    } else if (search_mode === 'vector') {
      // No embedding and no keyword fallback requested: nothing to search with.
      return [];
    }
  }

  // Keyword search (skipped in vector-only mode).
  let keywordRows: SessionSearchResult[] = [];
  if (search_mode !== 'vector') {
    keywordRows = await query<SessionSearchResult>(
      `SELECT s.id as session_id, s.session_number, s.project, s.summary, s.started_at,
              ts_rank(s.search_vector, plainto_tsquery('english', $1)) as similarity
       FROM sessions s
       WHERE s.search_vector @@ plainto_tsquery('english', $1)
         AND s.status = 'completed'${projectClause}
       ORDER BY similarity DESC
       LIMIT $2`,
      [searchQuery, limit, ...projectParams]
    );
  }

  // Fuse only when both searches contributed; otherwise return whichever list
  // has rows (vector preferred), already ordered and limited by its query.
  if (search_mode === 'hybrid' && vectorRows.length > 0 && keywordRows.length > 0) {
    // Keyword rows are inserted first so that a session found by both searches
    // resolves to its vector row.
    const rowsById = new Map<string, SessionSearchResult>();
    for (const row of keywordRows) rowsById.set(row.session_id, row);
    for (const row of vectorRows) rowsById.set(row.session_id, row);

    // NOTE(review): assumes rrfMerge returns entries ordered best-first, each
    // carrying the fused session id in `id` — confirm against embeddings.js.
    const merged = rrfMerge(
      vectorRows.map((row) => row.session_id),
      keywordRows.map((row) => row.session_id)
    );

    const results: SessionSearchResult[] = [];
    for (const entry of merged.slice(0, limit)) {
      const row = rowsById.get(entry.id as string);
      if (row) results.push(row);
    }
    return results;
  }

  return vectorRows.length > 0 ? vectorRows : keywordRows;
}