feat(CF-1314): Content hashing to prevent duplicate embeddings

A SHA-256 hash check before the embedding API call eliminates ~60-80%
of redundant embedding requests. Also consolidates the dual INSERT paths
into a single INSERT with a nullable embedding column.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-18 08:28:11 +02:00
parent 77097ac65f
commit 1f499bd926
6 changed files with 127 additions and 81 deletions

View File

@@ -1,7 +1,7 @@
// Session memory operations for persistent learnings
import { query, queryOne, execute } from '../db.js';
import { getEmbedding, formatEmbedding } from '../embeddings.js';
import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
type MemoryCategory = 'pattern' | 'fix' | 'preference' | 'gotcha' | 'architecture';
@@ -61,24 +61,33 @@ export async function memoryAdd(args: MemoryAddArgs): Promise<string> {
}
}
// Generate embedding for semantic search
// CF-1314: Hash content for dedup before embedding API call
const embedText = `${title}. ${content}`;
const contentHash = generateContentHash(embedText);
// Scope dedup to project if provided, otherwise global
const existing = project
? await queryOne<{ id: number }>(
'SELECT id FROM memories WHERE content_hash = $1 AND project = $2 LIMIT 1',
[contentHash, project]
)
: await queryOne<{ id: number }>(
'SELECT id FROM memories WHERE content_hash = $1 AND project IS NULL LIMIT 1',
[contentHash]
);
if (existing) {
return `Memory already exists (id: ${existing.id}): [${category}] ${title}`;
}
// Generate embedding for semantic search
const embedding = await getEmbedding(embedText);
const embeddingValue = embedding ? formatEmbedding(embedding) : null;
if (embeddingValue) {
await execute(
`INSERT INTO memories (category, title, content, context, project, session_id, task_id, embedding)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`,
[category, title, content, context || null, project || null, validSessionId, task_id || null, embeddingValue]
);
} else {
await execute(
`INSERT INTO memories (category, title, content, context, project, session_id, task_id)
VALUES ($1, $2, $3, $4, $5, $6, $7)`,
[category, title, content, context || null, project || null, validSessionId, task_id || null]
);
}
await execute(
`INSERT INTO memories (category, title, content, context, project, session_id, task_id, embedding, content_hash)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
[category, title, content, context || null, project || null, validSessionId, task_id || null, embeddingValue, contentHash]
);
return `Stored memory: [${category}] ${title}`;
}