feat(CF-2394): Add session_transcript_search MCP tool

Hybrid (vector + keyword + rerank) search over indexed session transcripts. Enables context recovery from past sessions without re-reading the JSONL files.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 08:37:19 +02:00
parent 3613e2aa52
commit 0fad29801e
3 changed files with 190 additions and 0 deletions
--- a/src/index.ts
+++ b/src/index.ts
@@ -81,6 +81,7 @@ import {
  sessionPatternDetection,
 } from './tools/session-docs.js';
 import { archiveAdd, archiveSearch, archiveList, archiveGet } from './tools/archives.js';
+import { transcriptSearch } from './tools/transcripts.js';
 import { projectArchive } from './tools/project-archive.js';

 // Create MCP server
@@ -454,6 +455,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
        );
        break;

+      // Transcripts (CF-2394)
+      case 'session_transcript_search':
+        result = await transcriptSearch({
+          query: a.query,
+          project: a.project,
+          session_issue_key: a.session_issue_key,
+          limit: a.limit,
+          search_mode: a.search_mode,
+        });
+        break;
+
      // Archives
      case 'archive_add':
        result = await archiveAdd({
--- a/src/tools/index.ts
+++ b/src/tools/index.ts
@@ -653,6 +653,23 @@ export const toolDefinitions = [
    },
  },

+  // Transcript Tools (CF-2394)
+  {
+    name: 'session_transcript_search',
+    description: 'Search session transcripts (JSONL) using hybrid (vector + keyword) search. Finds past sessions by content — commands run, decisions made, plans discussed. Use when recovering context from prior sessions.',
+    inputSchema: {
+      type: 'object',
+      properties: {
+        query: { type: 'string', description: 'Search query (e.g., "hetzner disk resize", "auth migration plan")' },
+        project: { type: 'string', description: 'Filter by project key (optional)' },
+        session_issue_key: { type: 'string', description: 'Filter by session Jira issue key (optional)' },
+        limit: { type: 'number', description: 'Max results (default: 10)' },
+        search_mode: { type: 'string', enum: ['hybrid', 'vector', 'keyword'], description: 'Search mode (default: hybrid)' },
+      },
+      required: ['query'],
+    },
+  },
+
  // Archive Tools
  {
    name: 'archive_add',
--- a/src/tools/transcripts.ts
+++ b/src/tools/transcripts.ts
@@ -0,0 +1,161 @@
+// Session transcript search (CF-2394)
+
+import { query } from '../db.js';
+import { getEmbedding, formatEmbedding, rrfMerge, rerank } from '../embeddings.js';
+
+/** Input accepted by `transcriptSearch`. Only `query` is mandatory. */
+interface TranscriptSearchArgs {
+  /** Free-text search string. */
+  query: string;
+
+  /** When set, restricts results to rows whose `project_key` equals this value. */
+  project?: string;
+
+  /** When set, restricts results to rows whose `session_issue_key` equals this value. */
+  session_issue_key?: string;
+
+  /** Maximum number of results; `transcriptSearch` falls back to 10 when omitted. */
+  limit?: number;
+
+  /** Which search legs to run; `transcriptSearch` falls back to `'hybrid'` when omitted. */
+  search_mode?:
+    | 'hybrid'
+    | 'vector'
+    | 'keyword';
+}
+
+/**
+ * One row selected from `session_transcripts` by the search queries.
+ * The trailing optional fields are only present when the query that
+ * produced the row selected them.
+ */
+interface TranscriptRow {
+  // Identity
+  id: number;
+  session_uuid: string;
+  session_issue_key: string | null;
+  project_key: string;
+
+  // Session metadata
+  git_branch: string | null;
+  message_count: number;
+  tool_names: string[] | null;
+  /** Start time already formatted by SQL as `YYYY-MM-DD HH24:MI`. */
+  started_at: string | null;
+
+  // Per-query extras
+  /** Selected by the vector query as `1 - (embedding <=> $1)`. */
+  similarity?: number;
+  /** Selected by the keyword query via `ts_rank`. */
+  rank?: number;
+  /** Selected by the keyword query via `ts_headline`; matches are wrapped in `**`. */
+  snippet?: string;
+}
+
+/** Result count used when the caller supplies no usable `limit`. */
+const DEFAULT_TRANSCRIPT_LIMIT = 10;
+
+/** Base URL used to turn a session issue key into a clickable link. */
+const JIRA_BROWSE_URL = 'https://agiliton.atlassian.net/browse/';
+
+/** Ranked ids from one search leg plus the rows they refer to. */
+interface TranscriptHits {
+  ids: number[];
+  rows: Map<number, TranscriptRow>;
+}
+
+/** Optional equality filters shared by both search legs. */
+type TranscriptFilters = Pick<TranscriptSearchArgs, 'project' | 'session_issue_key'>;
+
+/** Coerces `limit` to a positive integer so it is always valid as a SQL LIMIT parameter. */
+function normalizeTranscriptLimit(limit: unknown): number {
+  if (typeof limit !== 'number' || !Number.isFinite(limit)) return DEFAULT_TRANSCRIPT_LIMIT;
+  return Math.max(1, Math.floor(limit));
+}
+
+/** Indexes query rows by id while keeping their ranked order. */
+function toTranscriptHits(rows: TranscriptRow[]): TranscriptHits {
+  return {
+    ids: rows.map(r => r.id),
+    rows: new Map(rows.map(r => [r.id, r] as const)),
+  };
+}
+
+/** Builds the optional ` AND ...` clauses, numbering placeholders from `startIdx`. */
+function buildTranscriptFilter(
+  filters: TranscriptFilters,
+  startIdx: number,
+): { where: string; params: unknown[] } {
+  let where = '';
+  const params: unknown[] = [];
+  let idx = startIdx;
+  if (filters.project) {
+    where += ` AND project_key = $${idx++}`;
+    params.push(filters.project);
+  }
+  if (filters.session_issue_key) {
+    where += ` AND session_issue_key = $${idx++}`;
+    params.push(filters.session_issue_key);
+  }
+  return { where, params };
+}
+
+/** Runs the embedding-distance query; resolves to `null` when no embedding could be produced. */
+async function searchTranscriptsByVector(
+  searchQuery: string,
+  limit: number,
+  filters: TranscriptFilters,
+): Promise<TranscriptHits | null> {
+  const embedding = await getEmbedding(searchQuery);
+  if (!embedding) return null;
+
+  // $1 = embedding, $2 = limit, so filter placeholders start at $3.
+  const filter = buildTranscriptFilter(filters, 3);
+  const rows = await query<TranscriptRow>(
+    `SELECT id, session_uuid, session_issue_key, project_key, git_branch,
+            message_count, tool_names,
+            to_char(started_at, 'YYYY-MM-DD HH24:MI') as started_at,
+            1 - (embedding <=> $1) as similarity
+     FROM session_transcripts
+     WHERE embedding IS NOT NULL${filter.where}
+     ORDER BY embedding <=> $1
+     LIMIT $2`,
+    [formatEmbedding(embedding), limit, ...filter.params]
+  );
+  return toTranscriptHits(rows);
+}
+
+/** Runs the full-text query; this is the only leg that selects a highlighted `snippet`. */
+async function searchTranscriptsByKeyword(
+  searchQuery: string,
+  limit: number,
+  filters: TranscriptFilters,
+): Promise<TranscriptHits> {
+  // $1 = query text, $2 = limit, so filter placeholders start at $3.
+  const filter = buildTranscriptFilter(filters, 3);
+  const rows = await query<TranscriptRow>(
+    `SELECT id, session_uuid, session_issue_key, project_key, git_branch,
+            message_count, tool_names,
+            to_char(started_at, 'YYYY-MM-DD HH24:MI') as started_at,
+            ts_rank(tsv, plainto_tsquery('english', $1)) as rank,
+            ts_headline('english', searchable_content,
+                plainto_tsquery('english', $1),
+                'StartSel=**,StopSel=**,MaxWords=25,MinWords=8') as snippet
+     FROM session_transcripts
+     WHERE tsv @@ plainto_tsquery('english', $1)${filter.where}
+     ORDER BY rank DESC
+     LIMIT $2`,
+    [searchQuery, limit, ...filter.params]
+  );
+  return toTranscriptHits(rows);
+}
+
+/**
+ * Searches `session_transcripts` by embedding distance, full-text match, or both.
+ *
+ * In `hybrid` mode, when both legs return rows, the two id lists are merged with
+ * `rrfMerge` and then passed to `rerank`; if `rerank` yields nothing the merged
+ * list is truncated to `limit`. Otherwise whichever leg produced rows is used.
+ *
+ * @param args Search text plus optional project / issue-key filters, result
+ *   limit (default 10; coerced to a positive integer) and search mode
+ *   (default `'hybrid'`).
+ * @returns A formatted text listing of matching transcripts, or a plain
+ *   message when the query is empty, the embedding could not be generated in
+ *   `vector` mode, or nothing matched.
+ * @throws Rejections from `query`, `getEmbedding` or `rerank` are not caught here.
+ */
+export async function transcriptSearch(args: TranscriptSearchArgs): Promise<string> {
+  const { query: searchQuery, project, session_issue_key, search_mode = 'hybrid' } = args;
+
+  // Arguments arrive from an untyped tool call, so validate at runtime too.
+  if (typeof searchQuery !== 'string' || searchQuery.trim() === '') {
+    return 'Error: query must be a non-empty string';
+  }
+  const limit = normalizeTranscriptLimit(args.limit);
+  const filters: TranscriptFilters = { project, session_issue_key };
+
+  // The two legs do not depend on each other, so run them concurrently.
+  const [vectorResult, keywordResult] = await Promise.all([
+    search_mode !== 'keyword' ? searchTranscriptsByVector(searchQuery, limit, filters) : undefined,
+    search_mode !== 'vector' ? searchTranscriptsByKeyword(searchQuery, limit, filters) : undefined,
+  ]);
+
+  const embeddingFailed = vectorResult === null;
+  if (embeddingFailed && search_mode === 'vector') {
+    return 'Error: Could not generate embedding for vector search';
+  }
+  const vector = vectorResult ?? toTranscriptHits([]);
+  const keyword = keywordResult ?? toTranscriptHits([]);
+
+  // Merge results
+  let finalIds: number[];
+  let searchLabel: string;
+
+  if (search_mode === 'hybrid' && vector.ids.length > 0 && keyword.ids.length > 0) {
+    const mergedIds = rrfMerge(vector.ids, keyword.ids).map(m => m.id as number);
+
+    // Re-rank on the keyword snippet where there is one, else the issue key.
+    const docs = mergedIds.map(id => {
+      const row = keyword.rows.get(id) ?? vector.rows.get(id);
+      return row?.snippet || row?.session_issue_key || '';
+    });
+    const reranked = await rerank(searchQuery, docs, limit);
+    if (reranked) {
+      // Drop any index that does not point into the merged list.
+      finalIds = reranked
+        .map(r => mergedIds[r.index])
+        .filter((id): id is number => id !== undefined);
+      searchLabel = 'hybrid+rerank';
+    } else {
+      finalIds = mergedIds.slice(0, limit);
+      searchLabel = 'hybrid';
+    }
+  } else if (vector.ids.length > 0) {
+    finalIds = vector.ids;
+    searchLabel = 'vector';
+  } else if (keyword.ids.length > 0) {
+    finalIds = keyword.ids;
+    searchLabel = embeddingFailed ? 'keyword (embedding unavailable)' : 'keyword';
+  } else {
+    return 'No matching transcripts found';
+  }
+
+  // Format output
+  const lines = [`Session transcripts (${searchLabel}, ${finalIds.length} results):\n`];
+  for (const id of finalIds) {
+    const vectorRow = vector.rows.get(id);
+    const keywordRow = keyword.rows.get(id);
+    const row = vectorRow ?? keywordRow;
+    if (!row) continue;
+
+    const scoreParts: string[] = [];
+    if (vectorRow?.similarity != null) {
+      scoreParts.push(`${Math.round(Number(vectorRow.similarity) * 100)}% semantic`);
+    }
+    if (keywordRow?.rank != null) {
+      scoreParts.push(`rank: ${Number(keywordRow.rank).toFixed(3)}`);
+    }
+    const scores = scoreParts.length > 0 ? ` (${scoreParts.join(', ')})` : '';
+
+    const issueLink = row.session_issue_key
+      ? `[${row.session_issue_key}](${JIRA_BROWSE_URL}${row.session_issue_key})`
+      : 'unlinked';
+    const tools = row.tool_names?.slice(0, 5).join(', ') || 'none';
+
+    lines.push(`**#${row.id}** ${issueLink} — ${row.project_key} (${row.git_branch || 'no-branch'})${scores}`);
+    lines.push(`  ${row.started_at || 'unknown date'} | ${row.message_count} msgs | Tools: ${tools}`);
+
+    // Read the snippet from the keyword row: the vector query selects none, so
+    // taking it from the vector row would drop it for transcripts found by both legs.
+    const snippet = keywordRow?.snippet;
+    if (snippet) {
+      lines.push(`  > ${snippet.replace(/\n/g, ' ').substring(0, 150)}`);
+    }
+    lines.push('');
+  }
+
+  return lines.join('\n');
+}