feat(voice): PDF context in voice calls + call transcript summary (MAT-10)
Pass PDF document context from room to voice session so the voice LLM can answer questions about uploaded PDFs. Persist call transcripts and post an LLM-generated summary to the room when the call ends. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
57
bot.py
57
bot.py
@@ -281,6 +281,7 @@ class Bot:
|
||||
self._pending_connects: dict[str, str] = {} # matrix_user_id -> device_code
|
||||
self._pending_translate: dict[str, dict] = {} # sender -> {text, detected_lang, room_id}
|
||||
self._pending_reply: dict[str, dict] = {} # sender -> {target_lang}
|
||||
self._room_pdf_context: dict[str, dict] = {} # room_id -> {filename, text, timestamp}
|
||||
|
||||
@staticmethod
|
||||
def _load_user_keys() -> dict[str, str]:
|
||||
@@ -462,6 +463,14 @@ class Bot:
|
||||
import secrets
|
||||
bot_key = secrets.token_bytes(16)
|
||||
|
||||
# Get PDF context if recently uploaded (within 1 hour)
|
||||
pdf_ctx = self._room_pdf_context.get(room_id, {})
|
||||
pdf_text = None
|
||||
if pdf_ctx and time.time() - pdf_ctx.get("timestamp", 0) < 3600:
|
||||
pdf_text = pdf_ctx.get("text")
|
||||
logger.info("Passing PDF context to voice session: %s (%d chars)",
|
||||
pdf_ctx.get("filename", "?"), len(pdf_text) if pdf_text else 0)
|
||||
|
||||
vs = VoiceSession(
|
||||
nio_client=self.client,
|
||||
room_id=room_id,
|
||||
@@ -473,6 +482,7 @@ class Bot:
|
||||
self._publish_encryption_key(rid, key)),
|
||||
memory=self.memory,
|
||||
caller_user_id=event.sender,
|
||||
document_context=pdf_text,
|
||||
)
|
||||
|
||||
# Check timeline for caller's key
|
||||
@@ -506,11 +516,19 @@ class Bot:
|
||||
self.active_callers.pop(room_id, None)
|
||||
vs = self.voice_sessions.pop(room_id, None)
|
||||
if vs:
|
||||
transcript = vs.get_transcript()
|
||||
try:
|
||||
await vs.stop()
|
||||
logger.info("Voice session stopped for %s", room_id)
|
||||
except Exception:
|
||||
logger.exception("Failed to stop voice session for %s", room_id)
|
||||
# Post call summary to room
|
||||
if transcript:
|
||||
try:
|
||||
summary = await self._summarize_call(transcript, room_id)
|
||||
await self._send_text(room_id, f"**Anruf-Zusammenfassung:**\n\n{summary}")
|
||||
except Exception:
|
||||
logger.exception("Failed to post call summary for %s", room_id)
|
||||
|
||||
# Leave the call too
|
||||
self.active_calls.discard(room_id)
|
||||
@@ -965,6 +983,13 @@ class Bot:
|
||||
if len(pdf_text) > 50000:
|
||||
pdf_text = pdf_text[:50000] + "\n\n[... truncated, PDF too long ...]"
|
||||
|
||||
# Store PDF context for voice session pickup
|
||||
self._room_pdf_context[room.room_id] = {
|
||||
"filename": filename,
|
||||
"text": pdf_text,
|
||||
"timestamp": time.time(),
|
||||
}
|
||||
|
||||
user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.'
|
||||
|
||||
await self.client.room_typing(room.room_id, typing_state=True)
|
||||
@@ -1458,6 +1483,38 @@ class Bot:
|
||||
content=content,
|
||||
)
|
||||
|
||||
async def _summarize_call(self, transcript: list[dict], room_id: str) -> str:
    """Generate a concise summary of a voice call transcript via LLM.

    Args:
        transcript: Transcript entries in chronological order. Each entry
            is read through its ``"role"`` and ``"text"`` keys; only the
            last 30 entries are considered.
        room_id: Room whose model override in ``self.room_models`` is used
            for the request (``DEFAULT_MODEL`` when the room has none).

    Returns:
        The stripped LLM summary. When no LLM client is configured, the
        request raises, or the reply carries no text, the last 20
        formatted transcript lines are returned instead.
    """
    # Format the tail of the transcript for the LLM (last 30 entries max).
    lines = []
    for entry in transcript[-30:]:
        # Any role other than "user" is labelled as the assistant.
        role = "Nutzer" if entry["role"] == "user" else "Assistent"
        lines.append(f"{role}: {entry['text']}")
    transcript_text = "\n".join(lines)

    # Raw fallback shared by every non-summary path: the last 20 lines.
    raw_fallback = "\n".join(lines[-20:])

    if not self.llm:
        return raw_fallback

    try:
        model = self.room_models.get(room_id, DEFAULT_MODEL)
        resp = await self.llm.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": (
                    "Fasse das folgende Anruf-Transkript kurz und praegnant zusammen. "
                    "Nenne die wichtigsten besprochenen Punkte, Entscheidungen und offene Fragen. "
                    "Antworte in der Sprache des Gespraechs. Maximal 5-8 Saetze."
                )},
                {"role": "user", "content": transcript_text},
            ],
            max_tokens=500,
        )
        # The message content may be None; normalise it to an empty string
        # instead of relying on an AttributeError to reach the fallback.
        summary = (resp.choices[0].message.content or "").strip()
    except Exception:
        # Best effort: a failed summary must not prevent posting something.
        logger.warning("Call summary LLM failed, falling back to raw transcript", exc_info=True)
        return raw_fallback

    if not summary:
        # An empty reply would otherwise be posted as a blank summary.
        logger.warning("Call summary LLM returned no content, falling back to raw transcript")
        return raw_fallback
    return summary
|
||||
|
||||
async def _send_text(self, room_id: str, text: str):
|
||||
await self.client.room_send(
|
||||
room_id,
|
||||
|
||||
Reference in New Issue
Block a user