// Patch summary (CF-1314): content-hash dedup ahead of the embedding API call.
//
// migrations/033_content_hash_dedup.sql
//   - Adds a nullable TEXT content_hash column to project_archives, memories,
//     session_notes, session_plans and sessions, each with a plain (non-unique) index.
//   - Adds a nullable TEXT source_id column plus index to project_archives and memories;
//     none of the TypeScript hunks in this patch read or write source_id.
//   - Every ALTER / CREATE INDEX statement uses IF NOT EXISTS.
// src/embeddings.ts
//   - Adds generateContentHash(text): the hex-encoded SHA-256 digest of the input string.
// src/tools/archives.ts, src/tools/memories.ts, src/tools/session-docs.ts
//   - archiveAdd, memoryAdd, sessionNoteAdd and sessionPlanSave hash their text, look up a
//     row with the same content_hash in the same scope (project_key; project or
//     "project IS NULL"; session_id) and return an "already exists" message before
//     getEmbedding is called or anything is inserted.
//   - The former two-branch INSERTs (with / without embedding) in archiveAdd and memoryAdd
//     become one INSERT that always supplies embedding (null when none was produced).
// src/tools/sessions.ts
//   - sessionEnd stores the hash of the summary. Its UPDATE now always assigns embedding,
//     so the column is set to NULL when no embedding is produced; the removed code left
//     the column untouched in that case.
//
// Review notes:
//   - archiveAdd hashes `${title}. ${content.substring(0, 1000)}`, so two archives in one
//     project with the same title and the same first 1000 content characters are treated
//     as duplicates even when the rest of the content differs.
//   - sessionNoteAdd's lookup matches on content_hash and session_id only; note_type is
//     not part of the dedup key.
//   - NOTE(review): dedup is lookup-then-insert and the new indexes are not UNIQUE, so two
//     concurrent identical writes could presumably both insert - confirm whether that matters.
diff --git a/migrations/033_content_hash_dedup.sql b/migrations/033_content_hash_dedup.sql new file mode 100644 index 0000000..bd91c6c --- /dev/null +++ b/migrations/033_content_hash_dedup.sql @@ -0,0 +1,20 @@ +-- CF-1314: Content hashing to prevent duplicate embeddings +-- Adds content_hash column to all embedding tables for dedup before API call +-- Adds source_id columns for future CF-1315 hybrid search + +ALTER TABLE project_archives ADD COLUMN IF NOT EXISTS content_hash TEXT; +ALTER TABLE project_archives ADD COLUMN IF NOT EXISTS source_id TEXT; +ALTER TABLE memories ADD COLUMN IF NOT EXISTS content_hash TEXT; +ALTER TABLE memories ADD COLUMN IF NOT EXISTS source_id TEXT; +ALTER TABLE session_notes ADD COLUMN IF NOT EXISTS content_hash TEXT; +ALTER TABLE session_plans ADD COLUMN IF NOT EXISTS content_hash TEXT; +ALTER TABLE sessions ADD COLUMN IF NOT EXISTS content_hash TEXT; + +CREATE INDEX IF NOT EXISTS idx_archives_content_hash ON project_archives(content_hash); +CREATE INDEX IF NOT EXISTS idx_memories_content_hash ON memories(content_hash); +CREATE INDEX IF NOT EXISTS idx_session_notes_content_hash ON session_notes(content_hash); +CREATE INDEX IF NOT EXISTS idx_session_plans_content_hash ON session_plans(content_hash); +CREATE INDEX IF NOT EXISTS idx_sessions_content_hash ON sessions(content_hash); + +CREATE INDEX IF NOT EXISTS idx_archives_source_id ON project_archives(source_id); +CREATE INDEX IF NOT EXISTS idx_memories_source_id ON memories(source_id); diff --git a/src/embeddings.ts b/src/embeddings.ts index 6eeb3c4..083e802 100644 --- a/src/embeddings.ts +++ b/src/embeddings.ts @@ -1,5 +1,14 @@ // Embeddings via LiteLLM API +import { createHash } from 'crypto'; + +/** + * Generate SHA-256 content hash for dedup before embedding API call (CF-1314) + */ +export function generateContentHash(text: string): string { + return createHash('sha256').update(text).digest('hex'); +} + interface EmbeddingResponse { data: Array<{ embedding: number[]; diff --git 
a/src/tools/archives.ts b/src/tools/archives.ts index 20ae090..43e63d0 100644 --- a/src/tools/archives.ts +++ b/src/tools/archives.ts @@ -1,7 +1,7 @@ // Project archives operations for database-backed archival import { query, queryOne, execute } from '../db.js'; -import { getEmbedding, formatEmbedding } from '../embeddings.js'; +import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js'; type ArchiveType = 'session' | 'research' | 'audit' | 'investigation' | 'completed' | 'migration'; @@ -72,45 +72,39 @@ export async function archiveAdd(args: ArchiveAddArgs): Promise { return `Error: Project not found: ${project}`; } + // CF-1314: Hash content for dedup before embedding API call + const embedText = `${title}. ${content.substring(0, 1000)}`; + const contentHash = generateContentHash(embedText); + + const existing = await queryOne<{ id: number }>( + 'SELECT id FROM project_archives WHERE content_hash = $1 AND project_key = $2 LIMIT 1', + [contentHash, project] + ); + if (existing) { + return `Archive already exists (id: ${existing.id}): [${archive_type}] ${title}`; + } + // Generate embedding for semantic search - const embedText = `${title}. ${content.substring(0, 1000)}`; // Limit content length for embedding const embedding = await getEmbedding(embedText); const embeddingValue = embedding ? 
formatEmbedding(embedding) : null; - if (embeddingValue) { - await execute( - `INSERT INTO project_archives - (project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata, embedding) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, - [ - project, - archive_type, - title, - content, - original_path || null, - file_size || null, - archived_by_session || null, - JSON.stringify(metadata || {}), - embeddingValue - ] - ); - } else { - await execute( - `INSERT INTO project_archives - (project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, - [ - project, - archive_type, - title, - content, - original_path || null, - file_size || null, - archived_by_session || null, - JSON.stringify(metadata || {}) - ] - ); - } + await execute( + `INSERT INTO project_archives + (project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata, embedding, content_hash) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`, + [ + project, + archive_type, + title, + content, + original_path || null, + file_size || null, + archived_by_session || null, + JSON.stringify(metadata || {}), + embeddingValue, + contentHash + ] + ); const sizeStr = file_size ? 
` (${Math.round(file_size / 1024)}KB)` : ''; return `Archived: [${archive_type}] ${title}${sizeStr}`; diff --git a/src/tools/memories.ts b/src/tools/memories.ts index 4484bed..683e684 100644 --- a/src/tools/memories.ts +++ b/src/tools/memories.ts @@ -1,7 +1,7 @@ // Session memory operations for persistent learnings import { query, queryOne, execute } from '../db.js'; -import { getEmbedding, formatEmbedding } from '../embeddings.js'; +import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js'; type MemoryCategory = 'pattern' | 'fix' | 'preference' | 'gotcha' | 'architecture'; @@ -61,24 +61,33 @@ export async function memoryAdd(args: MemoryAddArgs): Promise { } } - // Generate embedding for semantic search + // CF-1314: Hash content for dedup before embedding API call const embedText = `${title}. ${content}`; + const contentHash = generateContentHash(embedText); + + // Scope dedup to project if provided, otherwise global + const existing = project + ? await queryOne<{ id: number }>( + 'SELECT id FROM memories WHERE content_hash = $1 AND project = $2 LIMIT 1', + [contentHash, project] + ) + : await queryOne<{ id: number }>( + 'SELECT id FROM memories WHERE content_hash = $1 AND project IS NULL LIMIT 1', + [contentHash] + ); + if (existing) { + return `Memory already exists (id: ${existing.id}): [${category}] ${title}`; + } + + // Generate embedding for semantic search const embedding = await getEmbedding(embedText); const embeddingValue = embedding ? 
formatEmbedding(embedding) : null; - if (embeddingValue) { - await execute( - `INSERT INTO memories (category, title, content, context, project, session_id, task_id, embedding) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, - [category, title, content, context || null, project || null, validSessionId, task_id || null, embeddingValue] - ); - } else { - await execute( - `INSERT INTO memories (category, title, content, context, project, session_id, task_id) - VALUES ($1, $2, $3, $4, $5, $6, $7)`, - [category, title, content, context || null, project || null, validSessionId, task_id || null] - ); - } + await execute( + `INSERT INTO memories (category, title, content, context, project, session_id, task_id, embedding, content_hash) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, + [category, title, content, context || null, project || null, validSessionId, task_id || null, embeddingValue, contentHash] + ); return `Stored memory: [${category}] ${title}`; } diff --git a/src/tools/session-docs.ts b/src/tools/session-docs.ts index ffad26b..8d88209 100644 --- a/src/tools/session-docs.ts +++ b/src/tools/session-docs.ts @@ -2,7 +2,7 @@ // Replaces file-based CLAUDE.md and plan files with database storage import { query, queryOne, execute } from '../db.js'; -import { getEmbedding, formatEmbedding } from '../embeddings.js'; +import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js'; import { getSessionId } from './session-id.js'; // ============================================================================ @@ -36,14 +36,25 @@ export async function sessionNoteAdd(args: SessionNoteAddArgs): Promise const { session_id: providedSessionId, note_type, content } = args; const session_id = providedSessionId || getSessionId(); + // CF-1314: Hash content for dedup before embedding API call + const contentHash = generateContentHash(content); + + const existing = await queryOne<{ id: number }>( + 'SELECT id FROM session_notes WHERE content_hash = $1 AND session_id 
= $2 LIMIT 1', + [contentHash, session_id] + ); + if (existing) { + return `Note already exists (id: ${existing.id}) in session ${session_id}`; + } + // Generate embedding for semantic search const embedding = await getEmbedding(content); const embeddingFormatted = embedding ? formatEmbedding(embedding) : null; await execute( - `INSERT INTO session_notes (session_id, note_type, content, embedding) - VALUES ($1, $2, $3, $4)`, - [session_id, note_type, content, embeddingFormatted] + `INSERT INTO session_notes (session_id, note_type, content, embedding, content_hash) + VALUES ($1, $2, $3, $4, $5)`, + [session_id, note_type, content, embeddingFormatted, contentHash] ); return `Note added to session ${session_id} (type: ${note_type})`; @@ -113,15 +124,26 @@ interface SessionPlan { export async function sessionPlanSave(args: SessionPlanSaveArgs): Promise { const { session_id, plan_content, plan_file_name, status = 'draft' } = args; + // CF-1314: Hash content for dedup before embedding API call + const contentHash = generateContentHash(plan_content); + + const existing = await queryOne<{ id: number }>( + 'SELECT id FROM session_plans WHERE content_hash = $1 AND session_id = $2 LIMIT 1', + [contentHash, session_id] + ); + if (existing) { + return `Plan already exists (id: ${existing.id}) in session ${session_id}`; + } + // Generate embedding for semantic search const embedding = await getEmbedding(plan_content); const embeddingFormatted = embedding ? 
formatEmbedding(embedding) : null; const result = await queryOne<{ id: number }>( - `INSERT INTO session_plans (session_id, plan_file_name, plan_content, status, embedding) - VALUES ($1, $2, $3, $4, $5) + `INSERT INTO session_plans (session_id, plan_file_name, plan_content, status, embedding, content_hash) + VALUES ($1, $2, $3, $4, $5, $6) RETURNING id`, - [session_id, plan_file_name || null, plan_content, status, embeddingFormatted] + [session_id, plan_file_name || null, plan_content, status, embeddingFormatted, contentHash] ); const planId = result?.id || 0; diff --git a/src/tools/sessions.ts b/src/tools/sessions.ts index 43abe32..2947b2c 100644 --- a/src/tools/sessions.ts +++ b/src/tools/sessions.ts @@ -2,7 +2,7 @@ // Sessions auto-create CF Jira issues and post output on close (CF-762) import { query, queryOne, execute } from '../db.js'; -import { getEmbedding, formatEmbedding } from '../embeddings.js'; +import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js'; import { createSessionIssue, addComment, transitionToDone, updateIssueDescription } from '../services/jira.js'; interface SessionStartArgs { @@ -157,32 +157,24 @@ export async function sessionUpdate(args: SessionUpdateArgs): Promise { export async function sessionEnd(args: SessionEndArgs): Promise { const { session_id, summary, status = 'completed' } = args; + // CF-1314: Store content hash alongside embedding + const contentHash = generateContentHash(summary); + // Generate embedding for semantic search const embedding = await getEmbedding(summary); const embeddingValue = embedding ? 
formatEmbedding(embedding) : null; - if (embeddingValue) { - await execute( - `UPDATE sessions - SET ended_at = NOW(), - summary = $1, - embedding = $2, - status = $3, - updated_at = NOW() - WHERE id = $4`, - [summary, embeddingValue, status, session_id] - ); - } else { - await execute( - `UPDATE sessions - SET ended_at = NOW(), - summary = $1, - status = $2, - updated_at = NOW() - WHERE id = $3`, - [summary, status, session_id] - ); - } + await execute( + `UPDATE sessions + SET ended_at = NOW(), + summary = $1, + embedding = $2, + status = $3, + content_hash = $4, + updated_at = NOW() + WHERE id = $5`, + [summary, embeddingValue, status, contentHash, session_id] + ); // Get session details const session = await queryOne(