feat(CF-1314): Content hashing to prevent duplicate embeddings

SHA-256 hash check before embedding API call eliminates ~60-80% of
redundant embedding requests. Consolidates dual INSERT paths to single
INSERT with nullable embedding column.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-18 08:28:11 +02:00
parent 77097ac65f
commit 1f499bd926
6 changed files with 127 additions and 81 deletions

View File

@@ -0,0 +1,20 @@
-- CF-1314: Content hashing to prevent duplicate embeddings
-- Adds content_hash column to all embedding tables for dedup before API call
-- Adds source_id columns for future CF-1315 hybrid search
ALTER TABLE project_archives ADD COLUMN IF NOT EXISTS content_hash TEXT;
ALTER TABLE project_archives ADD COLUMN IF NOT EXISTS source_id TEXT;
ALTER TABLE memories ADD COLUMN IF NOT EXISTS content_hash TEXT;
ALTER TABLE memories ADD COLUMN IF NOT EXISTS source_id TEXT;
ALTER TABLE session_notes ADD COLUMN IF NOT EXISTS content_hash TEXT;
ALTER TABLE session_plans ADD COLUMN IF NOT EXISTS content_hash TEXT;
ALTER TABLE sessions ADD COLUMN IF NOT EXISTS content_hash TEXT;
CREATE INDEX IF NOT EXISTS idx_archives_content_hash ON project_archives(content_hash);
CREATE INDEX IF NOT EXISTS idx_memories_content_hash ON memories(content_hash);
CREATE INDEX IF NOT EXISTS idx_session_notes_content_hash ON session_notes(content_hash);
CREATE INDEX IF NOT EXISTS idx_session_plans_content_hash ON session_plans(content_hash);
CREATE INDEX IF NOT EXISTS idx_sessions_content_hash ON sessions(content_hash);
CREATE INDEX IF NOT EXISTS idx_archives_source_id ON project_archives(source_id);
CREATE INDEX IF NOT EXISTS idx_memories_source_id ON memories(source_id);

View File

@@ -1,5 +1,14 @@
// Embeddings via LiteLLM API
import { createHash } from 'crypto';
/**
* Generate SHA-256 content hash for dedup before embedding API call (CF-1314)
*/
export function generateContentHash(text: string): string {
return createHash('sha256').update(text).digest('hex');
}
interface EmbeddingResponse {
data: Array<{
embedding: number[];

View File

@@ -1,7 +1,7 @@
// Project archives operations for database-backed archival
import { query, queryOne, execute } from '../db.js';
import { getEmbedding, formatEmbedding } from '../embeddings.js';
import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
type ArchiveType = 'session' | 'research' | 'audit' | 'investigation' | 'completed' | 'migration';
@@ -72,16 +72,26 @@ export async function archiveAdd(args: ArchiveAddArgs): Promise<string> {
return `Error: Project not found: ${project}`;
}
// CF-1314: Hash content for dedup before embedding API call
const embedText = `${title}. ${content.substring(0, 1000)}`;
const contentHash = generateContentHash(embedText);
const existing = await queryOne<{ id: number }>(
'SELECT id FROM project_archives WHERE content_hash = $1 AND project_key = $2 LIMIT 1',
[contentHash, project]
);
if (existing) {
return `Archive already exists (id: ${existing.id}): [${archive_type}] ${title}`;
}
// Generate embedding for semantic search
const embedText = `${title}. ${content.substring(0, 1000)}`; // Limit content length for embedding
const embedding = await getEmbedding(embedText);
const embeddingValue = embedding ? formatEmbedding(embedding) : null;
if (embeddingValue) {
await execute(
`INSERT INTO project_archives
(project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata, embedding)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
(project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata, embedding, content_hash)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`,
[
project,
archive_type,
@@ -91,26 +101,10 @@ export async function archiveAdd(args: ArchiveAddArgs): Promise<string> {
file_size || null,
archived_by_session || null,
JSON.stringify(metadata || {}),
embeddingValue
embeddingValue,
contentHash
]
);
} else {
await execute(
`INSERT INTO project_archives
(project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`,
[
project,
archive_type,
title,
content,
original_path || null,
file_size || null,
archived_by_session || null,
JSON.stringify(metadata || {})
]
);
}
const sizeStr = file_size ? ` (${Math.round(file_size / 1024)}KB)` : '';
return `Archived: [${archive_type}] ${title}${sizeStr}`;

View File

@@ -1,7 +1,7 @@
// Session memory operations for persistent learnings
import { query, queryOne, execute } from '../db.js';
import { getEmbedding, formatEmbedding } from '../embeddings.js';
import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
type MemoryCategory = 'pattern' | 'fix' | 'preference' | 'gotcha' | 'architecture';
@@ -61,24 +61,33 @@ export async function memoryAdd(args: MemoryAddArgs): Promise<string> {
}
}
// Generate embedding for semantic search
// CF-1314: Hash content for dedup before embedding API call
const embedText = `${title}. ${content}`;
const contentHash = generateContentHash(embedText);
// Scope dedup to project if provided, otherwise global
const existing = project
? await queryOne<{ id: number }>(
'SELECT id FROM memories WHERE content_hash = $1 AND project = $2 LIMIT 1',
[contentHash, project]
)
: await queryOne<{ id: number }>(
'SELECT id FROM memories WHERE content_hash = $1 AND project IS NULL LIMIT 1',
[contentHash]
);
if (existing) {
return `Memory already exists (id: ${existing.id}): [${category}] ${title}`;
}
// Generate embedding for semantic search
const embedding = await getEmbedding(embedText);
const embeddingValue = embedding ? formatEmbedding(embedding) : null;
if (embeddingValue) {
await execute(
`INSERT INTO memories (category, title, content, context, project, session_id, task_id, embedding)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`,
[category, title, content, context || null, project || null, validSessionId, task_id || null, embeddingValue]
`INSERT INTO memories (category, title, content, context, project, session_id, task_id, embedding, content_hash)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
[category, title, content, context || null, project || null, validSessionId, task_id || null, embeddingValue, contentHash]
);
} else {
await execute(
`INSERT INTO memories (category, title, content, context, project, session_id, task_id)
VALUES ($1, $2, $3, $4, $5, $6, $7)`,
[category, title, content, context || null, project || null, validSessionId, task_id || null]
);
}
return `Stored memory: [${category}] ${title}`;
}

View File

@@ -2,7 +2,7 @@
// Replaces file-based CLAUDE.md and plan files with database storage
import { query, queryOne, execute } from '../db.js';
import { getEmbedding, formatEmbedding } from '../embeddings.js';
import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
import { getSessionId } from './session-id.js';
// ============================================================================
@@ -36,14 +36,25 @@ export async function sessionNoteAdd(args: SessionNoteAddArgs): Promise<string>
const { session_id: providedSessionId, note_type, content } = args;
const session_id = providedSessionId || getSessionId();
// CF-1314: Hash content for dedup before embedding API call
const contentHash = generateContentHash(content);
const existing = await queryOne<{ id: number }>(
'SELECT id FROM session_notes WHERE content_hash = $1 AND session_id = $2 LIMIT 1',
[contentHash, session_id]
);
if (existing) {
return `Note already exists (id: ${existing.id}) in session ${session_id}`;
}
// Generate embedding for semantic search
const embedding = await getEmbedding(content);
const embeddingFormatted = embedding ? formatEmbedding(embedding) : null;
await execute(
`INSERT INTO session_notes (session_id, note_type, content, embedding)
VALUES ($1, $2, $3, $4)`,
[session_id, note_type, content, embeddingFormatted]
`INSERT INTO session_notes (session_id, note_type, content, embedding, content_hash)
VALUES ($1, $2, $3, $4, $5)`,
[session_id, note_type, content, embeddingFormatted, contentHash]
);
return `Note added to session ${session_id} (type: ${note_type})`;
@@ -113,15 +124,26 @@ interface SessionPlan {
export async function sessionPlanSave(args: SessionPlanSaveArgs): Promise<string> {
const { session_id, plan_content, plan_file_name, status = 'draft' } = args;
// CF-1314: Hash content for dedup before embedding API call
const contentHash = generateContentHash(plan_content);
const existing = await queryOne<{ id: number }>(
'SELECT id FROM session_plans WHERE content_hash = $1 AND session_id = $2 LIMIT 1',
[contentHash, session_id]
);
if (existing) {
return `Plan already exists (id: ${existing.id}) in session ${session_id}`;
}
// Generate embedding for semantic search
const embedding = await getEmbedding(plan_content);
const embeddingFormatted = embedding ? formatEmbedding(embedding) : null;
const result = await queryOne<{ id: number }>(
`INSERT INTO session_plans (session_id, plan_file_name, plan_content, status, embedding)
VALUES ($1, $2, $3, $4, $5)
`INSERT INTO session_plans (session_id, plan_file_name, plan_content, status, embedding, content_hash)
VALUES ($1, $2, $3, $4, $5, $6)
RETURNING id`,
[session_id, plan_file_name || null, plan_content, status, embeddingFormatted]
[session_id, plan_file_name || null, plan_content, status, embeddingFormatted, contentHash]
);
const planId = result?.id || 0;

View File

@@ -2,7 +2,7 @@
// Sessions auto-create CF Jira issues and post output on close (CF-762)
import { query, queryOne, execute } from '../db.js';
import { getEmbedding, formatEmbedding } from '../embeddings.js';
import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
import { createSessionIssue, addComment, transitionToDone, updateIssueDescription } from '../services/jira.js';
interface SessionStartArgs {
@@ -157,32 +157,24 @@ export async function sessionUpdate(args: SessionUpdateArgs): Promise<string> {
export async function sessionEnd(args: SessionEndArgs): Promise<string> {
const { session_id, summary, status = 'completed' } = args;
// CF-1314: Store content hash alongside embedding
const contentHash = generateContentHash(summary);
// Generate embedding for semantic search
const embedding = await getEmbedding(summary);
const embeddingValue = embedding ? formatEmbedding(embedding) : null;
if (embeddingValue) {
await execute(
`UPDATE sessions
SET ended_at = NOW(),
summary = $1,
embedding = $2,
status = $3,
content_hash = $4,
updated_at = NOW()
WHERE id = $4`,
[summary, embeddingValue, status, session_id]
WHERE id = $5`,
[summary, embeddingValue, status, contentHash, session_id]
);
} else {
await execute(
`UPDATE sessions
SET ended_at = NOW(),
summary = $1,
status = $2,
updated_at = NOW()
WHERE id = $3`,
[summary, status, session_id]
);
}
// Get session details
const session = await queryOne<Session & { jira_issue_key: string | null }>(