session-mcp/src/tools/archives.ts
Christian Gick · ef74d7912e · feat: Add cross-encoder re-ranking after hybrid search (CF-1317)
Add rerank() function calling LiteLLM /v1/rerank endpoint (Cohere-compatible).
Plugged into all 3 search functions (sessions, session-docs, archives) after
RRF merge. Disabled by default via RERANK_ENABLED env var. Graceful fallback
to RRF-only ranking on API failure.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 16:36:24 +02:00

339 lines · 10 KiB · TypeScript
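The commit above adds rerank(), which archives.ts imports from embeddings.ts but which is not part of this file. Going only by the commit description (LiteLLM /v1/rerank, Cohere-compatible response, RERANK_ENABLED gate, graceful fallback) and the call site further down, a minimal sketch of that helper might look like the code below; LITELLM_BASE_URL, LITELLM_API_KEY, and RERANK_MODEL are assumed names, not confirmed anywhere in this repo.

// Hypothetical sketch of rerank() in embeddings.ts; not the actual implementation.
// LITELLM_BASE_URL, LITELLM_API_KEY and RERANK_MODEL are assumed env var names.
interface RerankResult {
  index: number;           // position of the document in the input array
  relevance_score: number; // cross-encoder relevance score
}

export async function rerank(
  queryText: string,
  documents: string[],
  topN: number
): Promise<RerankResult[] | null> {
  // Disabled by default; opt in via RERANK_ENABLED (per the commit message).
  if (process.env.RERANK_ENABLED !== 'true' || documents.length === 0) return null;
  const baseUrl = process.env.LITELLM_BASE_URL;
  if (!baseUrl) return null;
  try {
    // LiteLLM exposes a Cohere-compatible /v1/rerank endpoint.
    const res = await fetch(`${baseUrl}/v1/rerank`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.LITELLM_API_KEY ?? ''}`,
      },
      body: JSON.stringify({
        model: process.env.RERANK_MODEL ?? 'rerank-english-v3.0',
        query: queryText,
        documents,
        top_n: topN,
      }),
    });
    if (!res.ok) return null;
    const data = (await res.json()) as { results: RerankResult[] };
    return data.results;
  } catch {
    // Graceful fallback: callers keep RRF-only ordering when re-ranking is unavailable.
    return null;
  }
}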

// Project archives operations for database-backed archival
import { query, queryOne, execute } from '../db.js';
import { getEmbedding, formatEmbedding, generateContentHash, rrfMerge, rerank } from '../embeddings.js';
type ArchiveType = 'session' | 'research' | 'audit' | 'investigation' | 'completed' | 'migration';
interface Archive {
id: number;
project_key: string;
archive_type: ArchiveType;
title: string;
content: string;
original_path: string | null;
file_size: number | null;
archived_at: string;
archived_by_session: string | null;
metadata: Record<string, unknown>;
created_at: string;
updated_at: string;
}
interface ArchiveAddArgs {
project: string;
archive_type: ArchiveType;
title: string;
content: string;
original_path?: string;
file_size?: number;
archived_by_session?: string;
metadata?: Record<string, unknown>;
}
type SearchMode = 'hybrid' | 'vector' | 'keyword';
interface ArchiveSearchArgs {
query: string;
project?: string;
archive_type?: ArchiveType;
limit?: number;
search_mode?: SearchMode;
}
interface ArchiveListArgs {
project?: string;
archive_type?: ArchiveType;
since?: string;
limit?: number;
}
interface ArchiveGetArgs {
id: number;
}
/**
* Verify project exists
*/
async function verifyProject(projectKey: string): Promise<boolean> {
const result = await queryOne<{ key: string }>(
'SELECT key FROM projects WHERE key = $1',
[projectKey]
);
return !!result;
}
/**
* Add a new archive entry
*/
export async function archiveAdd(args: ArchiveAddArgs): Promise<string> {
const { project, archive_type, title, content, original_path, file_size, archived_by_session, metadata } = args;
// Verify project exists
const exists = await verifyProject(project);
if (!exists) {
return `Error: Project not found: ${project}`;
}
// CF-1314: Hash content for dedup before embedding API call
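  // The hash covers the same text that gets embedded (title + first 1000 chars of content),
  // so exact duplicates return early and skip the embedding API call and the insert.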
const embedText = `${title}. ${content.substring(0, 1000)}`;
const contentHash = generateContentHash(embedText);
const existing = await queryOne<{ id: number }>(
'SELECT id FROM project_archives WHERE content_hash = $1 AND project_key = $2 LIMIT 1',
[contentHash, project]
);
if (existing) {
return `Archive already exists (id: ${existing.id}): [${archive_type}] ${title}`;
}
// Generate embedding for semantic search
const embedding = await getEmbedding(embedText);
const embeddingValue = embedding ? formatEmbedding(embedding) : null;
await execute(
`INSERT INTO project_archives
(project_key, archive_type, title, content, original_path, file_size, archived_by_session, metadata, embedding, content_hash)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`,
[
project,
archive_type,
title,
content,
original_path || null,
file_size || null,
archived_by_session || null,
JSON.stringify(metadata || {}),
embeddingValue,
contentHash
]
);
const sizeStr = file_size ? ` (${Math.round(file_size / 1024)}KB)` : '';
return `Archived: [${archive_type}] ${title}${sizeStr}`;
}
/**
* Search archives with hybrid (vector + keyword), vector-only, or keyword-only mode (CF-1315)
*/
export async function archiveSearch(args: ArchiveSearchArgs): Promise<string> {
const { query: searchQuery, project, archive_type, limit = 5, search_mode = 'hybrid' } = args;
// Build shared filter clause
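  // startIdx is the first free positional parameter: $1 and $2 are reserved for the
  // embedding/search text and LIMIT in the queries below.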
const buildFilter = (startIdx: number) => {
let where = '';
const params: unknown[] = [];
let idx = startIdx;
if (project) {
where += ` AND project_key = $${idx++}`;
params.push(project);
}
if (archive_type) {
where += ` AND archive_type = $${idx++}`;
params.push(archive_type);
}
return { where, params, nextIdx: idx };
};
// Vector search
let vectorIds: number[] = [];
  const vectorRows = new Map<number, Archive & { similarity: number }>();
let embeddingFailed = false;
if (search_mode !== 'keyword') {
const embedding = await getEmbedding(searchQuery);
if (embedding) {
const embeddingStr = formatEmbedding(embedding);
const filter = buildFilter(3);
const params: unknown[] = [embeddingStr, limit, ...filter.params];
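      // pgvector's <=> operator returns cosine distance, so 1 - distance is reported as similarity.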
const rows = await query<Archive & { similarity: number }>(
`SELECT id, archive_type, title, original_path, file_size,
to_char(archived_at, 'YYYY-MM-DD') as archived_at,
1 - (embedding <=> $1) as similarity
FROM project_archives
WHERE embedding IS NOT NULL${filter.where}
ORDER BY embedding <=> $1
LIMIT $2`,
params
);
vectorIds = rows.map(r => r.id);
for (const r of rows) vectorRows.set(r.id, r);
} else {
embeddingFailed = true;
if (search_mode === 'vector') {
return 'Error: Could not generate embedding for vector search';
}
}
}
// Keyword search
let keywordIds: number[] = [];
  const keywordRows = new Map<number, Archive & { rank: number }>();
if (search_mode !== 'vector') {
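    // Postgres full-text search: plainto_tsquery parses the raw query text and
    // ts_rank scores matches against the stored search_vector column.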
const filter = buildFilter(3);
const params: unknown[] = [searchQuery, limit, ...filter.params];
const rows = await query<Archive & { rank: number }>(
`SELECT id, archive_type, title, original_path, file_size,
to_char(archived_at, 'YYYY-MM-DD') as archived_at,
ts_rank(search_vector, plainto_tsquery('english', $1)) as rank
FROM project_archives
WHERE search_vector @@ plainto_tsquery('english', $1)${filter.where}
ORDER BY rank DESC
LIMIT $2`,
params
);
keywordIds = rows.map(r => r.id);
for (const r of rows) keywordRows.set(r.id, r);
}
// Merge results
let finalIds: number[];
let searchLabel: string;
let rerankScores: Map<number, number> | null = null;
if (search_mode === 'hybrid' && vectorIds.length > 0 && keywordIds.length > 0) {
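    // Combine the vector and keyword rankings with Reciprocal Rank Fusion
    // (typically scoring each id by the sum of 1/(k + rank) over both lists).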
const merged = rrfMerge(vectorIds, keywordIds);
finalIds = merged.map(m => m.id as number);
searchLabel = 'hybrid';
// Cross-encoder re-ranking (CF-1317)
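    // Only titles are available as documents here (content isn't selected by these
    // queries), so the cross-encoder scores query/title pairs.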
const docs = finalIds.map(id => {
const r = vectorRows.get(id) || keywordRows.get(id);
      return r?.title ?? '';
});
const reranked = await rerank(searchQuery, docs, limit);
if (reranked) {
rerankScores = new Map();
const reorderedIds = reranked.map(r => {
rerankScores!.set(finalIds[r.index], r.relevance_score);
return finalIds[r.index];
});
finalIds = reorderedIds;
searchLabel = 'hybrid+rerank';
} else {
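      // rerank() returned null (re-ranking disabled via RERANK_ENABLED, or the rerank
      // API failed): keep the RRF order, truncated to limit.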
finalIds = finalIds.slice(0, limit);
}
} else if (vectorIds.length > 0) {
finalIds = vectorIds;
searchLabel = 'vector';
} else if (keywordIds.length > 0) {
finalIds = keywordIds;
searchLabel = embeddingFailed ? 'keyword (embedding unavailable)' : 'keyword';
} else {
return 'No relevant archives found';
}
// Format output
const lines = [`Relevant archives (${searchLabel}):\n`];
for (const id of finalIds) {
const a = vectorRows.get(id) || keywordRows.get(id);
if (!a) continue;
const simParts: string[] = [];
if (vectorRows.has(id)) simParts.push(`${Math.round((vectorRows.get(id)!).similarity * 100)}% match`);
if (rerankScores?.has(id)) simParts.push(`rerank: ${rerankScores.get(id)!.toFixed(2)}`);
const scores = simParts.length > 0 ? ` (${simParts.join(', ')})` : '';
const sizeStr = a.file_size ? ` (${Math.round(a.file_size / 1024)}KB)` : '';
lines.push(`**[${a.archive_type}]** ${a.title}${scores}`);
lines.push(` Archived: ${a.archived_at}${sizeStr}`);
if (a.original_path) {
lines.push(` Path: ${a.original_path}`);
}
lines.push('');
}
return lines.join('\n');
}
/**
* List archives (non-semantic)
*/
export async function archiveList(args: ArchiveListArgs): Promise<string> {
const { project, archive_type, since, limit = 20 } = args;
let whereClause = 'WHERE 1=1';
const params: unknown[] = [];
let paramIndex = 1;
if (project) {
whereClause += ` AND project_key = $${paramIndex++}`;
params.push(project);
}
if (archive_type) {
whereClause += ` AND archive_type = $${paramIndex++}`;
params.push(archive_type);
}
if (since) {
whereClause += ` AND archived_at >= $${paramIndex++}`;
params.push(since);
}
params.push(limit);
const archives = await query<Archive>(
`SELECT id, archive_type, title, original_path, file_size,
to_char(archived_at, 'YYYY-MM-DD') as archived_at
FROM project_archives
${whereClause}
ORDER BY archived_at DESC
LIMIT $${paramIndex}`,
params
);
if (archives.length === 0) {
return `No archives found${project ? ` for project ${project}` : ''}`;
}
const lines = [`Archives${project ? ` (${project})` : ''}:\n`];
for (const a of archives) {
const sizeStr = a.file_size ? ` (${Math.round(a.file_size / 1024)}KB)` : '';
lines.push(`• [${a.archive_type}] ${a.title} - ${a.archived_at}${sizeStr}`);
if (a.original_path) {
lines.push(` ${a.original_path}`);
}
}
return lines.join('\n');
}
/**
* Get specific archive by ID
*/
export async function archiveGet(args: ArchiveGetArgs): Promise<string> {
const archive = await queryOne<Archive>(
`SELECT id, project_key, archive_type, title, content, original_path, file_size,
to_char(archived_at, 'YYYY-MM-DD') as archived_at,
archived_by_session, metadata
FROM project_archives
WHERE id = $1`,
[args.id]
);
if (!archive) {
return `Archive not found: ${args.id}`;
}
const sizeStr = archive.file_size ? ` (${Math.round(archive.file_size / 1024)}KB)` : '';
const lines = [
`# Archive #${archive.id}\n`,
`**Type:** ${archive.archive_type}`,
`**Title:** ${archive.title}`,
`**Archived:** ${archive.archived_at}${sizeStr}`,
];
if (archive.original_path) {
lines.push(`**Original Path:** ${archive.original_path}`);
}
if (archive.archived_by_session) {
lines.push(`**Session:** ${archive.archived_by_session}`);
}
lines.push('\n---\n');
lines.push(archive.content);
return lines.join('\n');
}
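
For orientation, a hypothetical caller elsewhere in session-mcp might use these exports as sketched below; the project key and argument values are purely illustrative.

// Hypothetical usage sketch (ESM, top-level await); values are illustrative only.
import { archiveAdd, archiveSearch } from './tools/archives.js';

console.log(await archiveAdd({
  project: 'session-mcp',
  archive_type: 'research',
  title: 'Hybrid search evaluation notes',
  content: 'Comparison of RRF-only vs. cross-encoder re-ranked ordering...',
}));

console.log(await archiveSearch({
  query: 'rerank evaluation',
  project: 'session-mcp',
  search_mode: 'hybrid',
  limit: 5,
}));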