feat(CF-1315): Hybrid search with tsvector + RRF

Add PostgreSQL full-text search alongside pgvector for exact matches
on Jira keys, error messages, and file paths. Merge results with
Reciprocal Rank Fusion. Default mode: hybrid, with graceful
degradation to keyword-only when embeddings are unavailable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-18 08:46:39 +02:00
parent 1f499bd926
commit 4f8996cd82
8 changed files with 434 additions and 183 deletions

View File

@@ -1,7 +1,7 @@
// Project archives operations for database-backed archival
import { query, queryOne, execute } from '../db.js';
import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
import { getEmbedding, formatEmbedding, generateContentHash, rrfMerge } from '../embeddings.js';
/** Category of an archived document; the optional `archive_type` search filter must be one of these values. */
type ArchiveType = 'session' | 'research' | 'audit' | 'investigation' | 'completed' | 'migration';
@@ -31,11 +31,14 @@ interface ArchiveAddArgs {
metadata?: Record<string, unknown>;
}
/**
 * Retrieval strategy for archive search (CF-1315):
 * - 'hybrid': run both the vector and the keyword query and merge the ranked ids with `rrfMerge`
 * - 'vector': embedding-similarity query only
 * - 'keyword': PostgreSQL full-text query only
 */
type SearchMode = 'hybrid' | 'vector' | 'keyword';
/** Arguments accepted by `archiveSearch`. */
interface ArchiveSearchArgs {
/** Search text; it is embedded for the vector query and passed to `plainto_tsquery` for the keyword query. */
query: string;
/** When set, only rows whose `project_key` equals this value are searched. */
project?: string;
/** When set, only rows with this `archive_type` are searched. */
archive_type?: ArchiveType;
/** Maximum number of rows per query; `archiveSearch` defaults it to 5. */
limit?: number;
/** Retrieval strategy; `archiveSearch` defaults it to 'hybrid'. */
search_mode?: SearchMode;
}
interface ArchiveListArgs {
@@ -111,97 +114,107 @@ export async function archiveAdd(args: ArchiveAddArgs): Promise<string> {
}
/**
* Search archives semantically
* Search archives with hybrid (vector + keyword), vector-only, or keyword-only mode (CF-1315)
*/
export async function archiveSearch(args: ArchiveSearchArgs): Promise<string> {
const { query: searchQuery, project, archive_type, limit = 5 } = args;
// Generate embedding for search
const embedding = await getEmbedding(searchQuery);
// Fallback to text search if embeddings unavailable
if (!embedding) {
console.warn('Embeddings unavailable, falling back to text search');
let whereClause = '(title ILIKE $1 OR content ILIKE $1)';
const params: unknown[] = [`%${searchQuery}%`];
let paramIndex = 2;
const { query: searchQuery, project, archive_type, limit = 5, search_mode = 'hybrid' } = args;
// Build shared filter clause
const buildFilter = (startIdx: number) => {
let where = '';
const params: unknown[] = [];
let idx = startIdx;
if (project) {
whereClause += ` AND project_key = $${paramIndex++}`;
where += ` AND project_key = $${idx++}`;
params.push(project);
}
if (archive_type) {
whereClause += ` AND archive_type = $${paramIndex++}`;
where += ` AND archive_type = $${idx++}`;
params.push(archive_type);
}
return { where, params, nextIdx: idx };
};
params.push(limit);
// Vector search
let vectorIds: number[] = [];
let vectorRows: Map<number, Archive & { similarity: number }> = new Map();
let embeddingFailed = false;
const archives = await query<Archive>(
if (search_mode !== 'keyword') {
const embedding = await getEmbedding(searchQuery);
if (embedding) {
const embeddingStr = formatEmbedding(embedding);
const filter = buildFilter(3);
const params: unknown[] = [embeddingStr, limit, ...filter.params];
const rows = await query<Archive & { similarity: number }>(
`SELECT id, archive_type, title, original_path, file_size,
to_char(archived_at, 'YYYY-MM-DD') as archived_at,
1 - (embedding <=> $1) as similarity
FROM project_archives
WHERE embedding IS NOT NULL${filter.where}
ORDER BY embedding <=> $1
LIMIT $2`,
params
);
vectorIds = rows.map(r => r.id);
for (const r of rows) vectorRows.set(r.id, r);
} else {
embeddingFailed = true;
if (search_mode === 'vector') {
return 'Error: Could not generate embedding for vector search';
}
}
}
// Keyword search
let keywordIds: number[] = [];
let keywordRows: Map<number, Archive & { rank: number }> = new Map();
if (search_mode !== 'vector') {
const filter = buildFilter(3);
const params: unknown[] = [searchQuery, limit, ...filter.params];
const rows = await query<Archive & { rank: number }>(
`SELECT id, archive_type, title, original_path, file_size,
to_char(archived_at, 'YYYY-MM-DD') as archived_at
to_char(archived_at, 'YYYY-MM-DD') as archived_at,
ts_rank(search_vector, plainto_tsquery('english', $1)) as rank
FROM project_archives
WHERE ${whereClause}
ORDER BY archived_at DESC
LIMIT $${paramIndex}`,
WHERE search_vector @@ plainto_tsquery('english', $1)${filter.where}
ORDER BY rank DESC
LIMIT $2`,
params
);
if (archives.length === 0) {
return 'No relevant archives found';
}
const lines = ['Relevant archives (text search - embeddings unavailable):\n'];
for (const a of archives) {
const sizeStr = a.file_size ? ` (${Math.round(a.file_size / 1024)}KB)` : '';
lines.push(`**[${a.archive_type}]** ${a.title}`);
lines.push(` Archived: ${a.archived_at}${sizeStr}`);
if (a.original_path) {
lines.push(` Path: ${a.original_path}`);
}
lines.push('');
}
return lines.join('\n');
keywordIds = rows.map(r => r.id);
for (const r of rows) keywordRows.set(r.id, r);
}
// Semantic search with embeddings
const embeddingStr = formatEmbedding(embedding);
// Merge results
let finalIds: number[];
let searchLabel: string;
let whereClause = 'WHERE embedding IS NOT NULL';
const params: unknown[] = [embeddingStr, limit];
let paramIndex = 3;
if (project) {
whereClause += ` AND project_key = $${paramIndex++}`;
params.splice(params.length - 1, 0, project);
}
if (archive_type) {
whereClause += ` AND archive_type = $${paramIndex++}`;
params.splice(params.length - 1, 0, archive_type);
}
const archives = await query<Archive & { similarity: number }>(
`SELECT id, archive_type, title, original_path, file_size,
to_char(archived_at, 'YYYY-MM-DD') as archived_at,
1 - (embedding <=> $1) as similarity
FROM project_archives
${whereClause}
ORDER BY embedding <=> $1
LIMIT $2`,
params
);
if (archives.length === 0) {
if (search_mode === 'hybrid' && vectorIds.length > 0 && keywordIds.length > 0) {
const merged = rrfMerge(vectorIds, keywordIds);
finalIds = merged.slice(0, limit).map(m => m.id as number);
searchLabel = 'hybrid';
} else if (vectorIds.length > 0) {
finalIds = vectorIds;
searchLabel = 'vector';
} else if (keywordIds.length > 0) {
finalIds = keywordIds;
searchLabel = embeddingFailed ? 'keyword (embedding unavailable)' : 'keyword';
} else {
return 'No relevant archives found';
}
const lines = ['Relevant archives:\n'];
for (const a of archives) {
const sim = Math.round(a.similarity * 100);
// Format output
const lines = [`Relevant archives (${searchLabel}):\n`];
for (const id of finalIds) {
const a = vectorRows.get(id) || keywordRows.get(id);
if (!a) continue;
const sim = vectorRows.has(id) ? ` (${Math.round((vectorRows.get(id)!).similarity * 100)}% match)` : '';
const sizeStr = a.file_size ? ` (${Math.round(a.file_size / 1024)}KB)` : '';
lines.push(`**[${a.archive_type}]** ${a.title} (${sim}% match)`);
lines.push(`**[${a.archive_type}]** ${a.title}${sim}`);
lines.push(` Archived: ${a.archived_at}${sizeStr}`);
if (a.original_path) {
lines.push(` Path: ${a.original_path}`);