user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.'
# Fallback: return last 20 exchanges as raw transcript
+ )}, + {"role": "user", "content": transcript_text}, + ], + max_tokens=500, + ) + return resp.choices[0].message.content.strip() + except Exception: + logger.warning("Call summary LLM failed, falling back to raw transcript", exc_info=True) + return "\n".join(lines[-20:]) + async def _send_text(self, room_id: str, text: str): await self.client.room_send( room_id, diff --git a/voice.py b/voice.py index 1b628a2..d6db7df 100644 --- a/voice.py +++ b/voice.py @@ -257,7 +257,8 @@ def _build_e2ee_options() -> rtc.E2EEOptions: class VoiceSession: def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet", publish_key_cb=None, bot_key: bytes | None = None, - memory=None, caller_user_id: str | None = None): + memory=None, caller_user_id: str | None = None, + document_context: str | None = None): self.nio_client = nio_client self.room_id = room_id self.device_id = device_id @@ -274,6 +275,8 @@ class VoiceSession: self._publish_key_cb = publish_key_cb self._memory = memory # MemoryClient instance from bot.py self._caller_user_id = caller_user_id # Matrix user ID for memory lookup + self._document_context = document_context # PDF text from room for voice context + self._transcript: list[dict] = [] # {"role": "user"|"assistant", "text": "..."} def on_encryption_key(self, sender, device_id, key, index): """Receive E2EE key from Element Call participant. 
@@ -382,6 +385,10 @@ class VoiceSession: except asyncio.CancelledError: pass + def get_transcript(self) -> list[dict]: + """Return the call transcript as a list of {role, text} dicts.""" + return list(self._transcript) + async def _run(self): try: user_id = self.nio_client.user_id @@ -648,6 +655,7 @@ class VoiceSession: logger.info("USER_SPEECH: %s", text) if ev.transcript: _last_user_speech.append(ev.transcript) + self._transcript.append({"role": "user", "text": ev.transcript}) @self.session.on("conversation_item_added") def _on_conversation_item(ev): @@ -655,6 +663,7 @@ class VoiceSession: text = getattr(ev.item, "text_content", "") or "" if role == "assistant" and text: logger.info("AGENT_SPEECH: %s", text) + self._transcript.append({"role": "assistant", "text": text}) if self._memory and self._caller_user_id and _last_user_speech: user_text = " ".join(_last_user_speech) _last_user_speech.clear() @@ -689,8 +698,11 @@ class VoiceSession: await _store_user_pref(caller_uid, "timezone", iana_timezone) return f"Timezone set to {iana_timezone}" + instructions = _build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section + if self._document_context: + instructions += f"\n\nDokument-Kontext (PDF im Raum hochgeladen):\n{self._document_context}" agent = _NoiseFilterAgent( - instructions=_build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section, + instructions=instructions, tools=[search_web, set_user_timezone], ) io_opts = room_io.RoomOptions(