feat(CF-1315): Hybrid search with tsvector + RRF
Add PostgreSQL full-text search alongside pgvector for exact matches on Jira keys, error messages, and file paths. Merge results with Reciprocal Rank Fusion. Default mode: hybrid, with graceful degradation to keyword-only when embeddings are unavailable. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
// Project archives operations for database-backed archival
|
||||
|
||||
import { query, queryOne, execute } from '../db.js';
|
||||
import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
|
||||
import { getEmbedding, formatEmbedding, generateContentHash, rrfMerge } from '../embeddings.js';
|
||||
|
||||
// Categories of archive records; used as the archive_type filter in search/list queries.
type ArchiveType = 'session' | 'research' | 'audit' | 'investigation' | 'completed' | 'migration';
|
||||
|
||||
@@ -31,11 +31,14 @@ interface ArchiveAddArgs {
|
||||
metadata?: Record<string, unknown>;
|
||||
}
|
||||
|
||||
// Search strategy (CF-1315): 'hybrid' merges vector + keyword results via RRF;
// 'vector' and 'keyword' run a single retrieval path only.
type SearchMode = 'hybrid' | 'vector' | 'keyword';
|
||||
|
||||
interface ArchiveSearchArgs {
|
||||
query: string;
|
||||
project?: string;
|
||||
archive_type?: ArchiveType;
|
||||
limit?: number;
|
||||
search_mode?: SearchMode;
|
||||
}
|
||||
|
||||
interface ArchiveListArgs {
|
||||
@@ -111,97 +114,107 @@ export async function archiveAdd(args: ArchiveAddArgs): Promise<string> {
|
||||
}
|
||||
|
||||
/**
|
||||
* Search archives semantically
|
||||
* Search archives with hybrid (vector + keyword), vector-only, or keyword-only mode (CF-1315)
|
||||
*/
|
||||
export async function archiveSearch(args: ArchiveSearchArgs): Promise<string> {
|
||||
const { query: searchQuery, project, archive_type, limit = 5 } = args;
|
||||
|
||||
// Generate embedding for search
|
||||
const embedding = await getEmbedding(searchQuery);
|
||||
|
||||
// Fallback to text search if embeddings unavailable
|
||||
if (!embedding) {
|
||||
console.warn('Embeddings unavailable, falling back to text search');
|
||||
|
||||
let whereClause = '(title ILIKE $1 OR content ILIKE $1)';
|
||||
const params: unknown[] = [`%${searchQuery}%`];
|
||||
let paramIndex = 2;
|
||||
const { query: searchQuery, project, archive_type, limit = 5, search_mode = 'hybrid' } = args;
|
||||
|
||||
// Build shared filter clause
|
||||
const buildFilter = (startIdx: number) => {
|
||||
let where = '';
|
||||
const params: unknown[] = [];
|
||||
let idx = startIdx;
|
||||
if (project) {
|
||||
whereClause += ` AND project_key = $${paramIndex++}`;
|
||||
where += ` AND project_key = $${idx++}`;
|
||||
params.push(project);
|
||||
}
|
||||
if (archive_type) {
|
||||
whereClause += ` AND archive_type = $${paramIndex++}`;
|
||||
where += ` AND archive_type = $${idx++}`;
|
||||
params.push(archive_type);
|
||||
}
|
||||
return { where, params, nextIdx: idx };
|
||||
};
|
||||
|
||||
params.push(limit);
|
||||
// Vector search
|
||||
let vectorIds: number[] = [];
|
||||
let vectorRows: Map<number, Archive & { similarity: number }> = new Map();
|
||||
let embeddingFailed = false;
|
||||
|
||||
const archives = await query<Archive>(
|
||||
if (search_mode !== 'keyword') {
|
||||
const embedding = await getEmbedding(searchQuery);
|
||||
if (embedding) {
|
||||
const embeddingStr = formatEmbedding(embedding);
|
||||
const filter = buildFilter(3);
|
||||
const params: unknown[] = [embeddingStr, limit, ...filter.params];
|
||||
|
||||
const rows = await query<Archive & { similarity: number }>(
|
||||
`SELECT id, archive_type, title, original_path, file_size,
|
||||
to_char(archived_at, 'YYYY-MM-DD') as archived_at,
|
||||
1 - (embedding <=> $1) as similarity
|
||||
FROM project_archives
|
||||
WHERE embedding IS NOT NULL${filter.where}
|
||||
ORDER BY embedding <=> $1
|
||||
LIMIT $2`,
|
||||
params
|
||||
);
|
||||
vectorIds = rows.map(r => r.id);
|
||||
for (const r of rows) vectorRows.set(r.id, r);
|
||||
} else {
|
||||
embeddingFailed = true;
|
||||
if (search_mode === 'vector') {
|
||||
return 'Error: Could not generate embedding for vector search';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Keyword search
|
||||
let keywordIds: number[] = [];
|
||||
let keywordRows: Map<number, Archive & { rank: number }> = new Map();
|
||||
|
||||
if (search_mode !== 'vector') {
|
||||
const filter = buildFilter(3);
|
||||
const params: unknown[] = [searchQuery, limit, ...filter.params];
|
||||
|
||||
const rows = await query<Archive & { rank: number }>(
|
||||
`SELECT id, archive_type, title, original_path, file_size,
|
||||
to_char(archived_at, 'YYYY-MM-DD') as archived_at
|
||||
to_char(archived_at, 'YYYY-MM-DD') as archived_at,
|
||||
ts_rank(search_vector, plainto_tsquery('english', $1)) as rank
|
||||
FROM project_archives
|
||||
WHERE ${whereClause}
|
||||
ORDER BY archived_at DESC
|
||||
LIMIT $${paramIndex}`,
|
||||
WHERE search_vector @@ plainto_tsquery('english', $1)${filter.where}
|
||||
ORDER BY rank DESC
|
||||
LIMIT $2`,
|
||||
params
|
||||
);
|
||||
|
||||
if (archives.length === 0) {
|
||||
return 'No relevant archives found';
|
||||
}
|
||||
|
||||
const lines = ['Relevant archives (text search - embeddings unavailable):\n'];
|
||||
for (const a of archives) {
|
||||
const sizeStr = a.file_size ? ` (${Math.round(a.file_size / 1024)}KB)` : '';
|
||||
lines.push(`**[${a.archive_type}]** ${a.title}`);
|
||||
lines.push(` Archived: ${a.archived_at}${sizeStr}`);
|
||||
if (a.original_path) {
|
||||
lines.push(` Path: ${a.original_path}`);
|
||||
}
|
||||
lines.push('');
|
||||
}
|
||||
|
||||
return lines.join('\n');
|
||||
keywordIds = rows.map(r => r.id);
|
||||
for (const r of rows) keywordRows.set(r.id, r);
|
||||
}
|
||||
|
||||
// Semantic search with embeddings
|
||||
const embeddingStr = formatEmbedding(embedding);
|
||||
// Merge results
|
||||
let finalIds: number[];
|
||||
let searchLabel: string;
|
||||
|
||||
let whereClause = 'WHERE embedding IS NOT NULL';
|
||||
const params: unknown[] = [embeddingStr, limit];
|
||||
let paramIndex = 3;
|
||||
|
||||
if (project) {
|
||||
whereClause += ` AND project_key = $${paramIndex++}`;
|
||||
params.splice(params.length - 1, 0, project);
|
||||
}
|
||||
if (archive_type) {
|
||||
whereClause += ` AND archive_type = $${paramIndex++}`;
|
||||
params.splice(params.length - 1, 0, archive_type);
|
||||
}
|
||||
|
||||
const archives = await query<Archive & { similarity: number }>(
|
||||
`SELECT id, archive_type, title, original_path, file_size,
|
||||
to_char(archived_at, 'YYYY-MM-DD') as archived_at,
|
||||
1 - (embedding <=> $1) as similarity
|
||||
FROM project_archives
|
||||
${whereClause}
|
||||
ORDER BY embedding <=> $1
|
||||
LIMIT $2`,
|
||||
params
|
||||
);
|
||||
|
||||
if (archives.length === 0) {
|
||||
if (search_mode === 'hybrid' && vectorIds.length > 0 && keywordIds.length > 0) {
|
||||
const merged = rrfMerge(vectorIds, keywordIds);
|
||||
finalIds = merged.slice(0, limit).map(m => m.id as number);
|
||||
searchLabel = 'hybrid';
|
||||
} else if (vectorIds.length > 0) {
|
||||
finalIds = vectorIds;
|
||||
searchLabel = 'vector';
|
||||
} else if (keywordIds.length > 0) {
|
||||
finalIds = keywordIds;
|
||||
searchLabel = embeddingFailed ? 'keyword (embedding unavailable)' : 'keyword';
|
||||
} else {
|
||||
return 'No relevant archives found';
|
||||
}
|
||||
|
||||
const lines = ['Relevant archives:\n'];
|
||||
for (const a of archives) {
|
||||
const sim = Math.round(a.similarity * 100);
|
||||
// Format output
|
||||
const lines = [`Relevant archives (${searchLabel}):\n`];
|
||||
for (const id of finalIds) {
|
||||
const a = vectorRows.get(id) || keywordRows.get(id);
|
||||
if (!a) continue;
|
||||
const sim = vectorRows.has(id) ? ` (${Math.round((vectorRows.get(id)!).similarity * 100)}% match)` : '';
|
||||
const sizeStr = a.file_size ? ` (${Math.round(a.file_size / 1024)}KB)` : '';
|
||||
lines.push(`**[${a.archive_type}]** ${a.title} (${sim}% match)`);
|
||||
lines.push(`**[${a.archive_type}]** ${a.title}${sim}`);
|
||||
lines.push(` Archived: ${a.archived_at}${sizeStr}`);
|
||||
if (a.original_path) {
|
||||
lines.push(` Path: ${a.original_path}`);
|
||||
|
||||
Reference in New Issue
Block a user