feat(CF-1314): Content hashing to prevent duplicate embeddings

A SHA-256 hash check before the embedding API call eliminates ~60-80% of
redundant embedding requests. Also consolidates the previous dual INSERT
paths into a single INSERT with a nullable embedding column.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-18 08:28:11 +02:00
parent 77097ac65f
commit 1f499bd926
6 changed files with 127 additions and 81 deletions

View File

@@ -1,7 +1,7 @@
// Project archives operations for database-backed archival
import { query, queryOne, execute } from '../db.js';
import { getEmbedding, formatEmbedding } from '../embeddings.js';
import { getEmbedding, formatEmbedding, generateContentHash } from '../embeddings.js';
type ArchiveType = 'session' | 'research' | 'audit' | 'investigation' | 'completed' | 'migration';
@@ -72,45 +72,39 @@ export async function archiveAdd(args: ArchiveAddArgs): Promise<string> {
return `Error: Project not found: ${project}`;
}
// CF-1314: Hash content for dedup before embedding API call
const embedText = `${title}. ${content.substring(0, 1000)}`;
const contentHash = generateContentHash(embedText);
const existing = await queryOne<{ id: number }>(
'SELECT id FROM project_archives WHERE content_hash = $1 AND project_key = $2 LIMIT 1',
[contentHash, project]
);
if (existing) {
return `Archive already exists (id: ${existing.id}): [${archive_type}] ${title}`;
}
// Generate embedding for semantic search
const embedText = `${title}. ${content.substring(0, 1000)}`; // Limit content length for embedding
const embedding = await getEmbedding(embedText);
const embeddingValue = embedding ? formatEmbedding(embedding) : null;
if (embeddingValue) {
await execute(
`INSERT INTO project_archives
(project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata, embedding)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`,
[
project,
archive_type,
title,
content,
original_path || null,
file_size || null,
archived_by_session || null,
JSON.stringify(metadata || {}),
embeddingValue
]
);
} else {
await execute(
`INSERT INTO project_archives
(project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`,
[
project,
archive_type,
title,
content,
original_path || null,
file_size || null,
archived_by_session || null,
JSON.stringify(metadata || {})
]
);
}
await execute(
`INSERT INTO project_archives
(project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata, embedding, content_hash)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`,
[
project,
archive_type,
title,
content,
original_path || null,
file_size || null,
archived_by_session || null,
JSON.stringify(metadata || {}),
embeddingValue,
contentHash
]
);
const sizeStr = file_size ? ` (${Math.round(file_size / 1024)}KB)` : '';
return `Archived: [${archive_type}] ${title}${sizeStr}`;