user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.'
# Fallback: return last 20 exchanges as raw transcript
+ )}, + {"role": "user", "content": transcript_text}, + ], + max_tokens=500, + ) + return resp.choices[0].message.content.strip() + except Exception: + logger.warning("Call summary LLM failed, falling back to raw transcript", exc_info=True) + return "\n".join(lines[-20:]) + async def _send_text(self, room_id: str, text: str): await self.client.room_send( room_id, diff --git a/voice.py b/voice.py index 1b628a2..d6db7df 100644 --- a/voice.py +++ b/voice.py @@ -257,7 +257,8 @@ def _build_e2ee_options() -> rtc.E2EEOptions: class VoiceSession: def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet", publish_key_cb=None, bot_key: bytes | None = None, - memory=None, caller_user_id: str | None = None): + memory=None, caller_user_id: str | None = None, + document_context: str | None = None): self.nio_client = nio_client self.room_id = room_id self.device_id = device_id @@ -274,6 +275,8 @@ class VoiceSession: self._publish_key_cb = publish_key_cb self._memory = memory # MemoryClient instance from bot.py self._caller_user_id = caller_user_id # Matrix user ID for memory lookup + self._document_context = document_context # PDF text from room for voice context + self._transcript: list[dict] = [] # {"role": "user"|"assistant", "text": "..."} def on_encryption_key(self, sender, device_id, key, index): """Receive E2EE key from Element Call participant. 
@@ -382,6 +385,10 @@ class VoiceSession: except asyncio.CancelledError: pass + def get_transcript(self) -> list[dict]: + """Return the call transcript as a list of {role, text} dicts.""" + return list(self._transcript) + async def _run(self): try: user_id = self.nio_client.user_id @@ -648,6 +655,7 @@ class VoiceSession: logger.info("USER_SPEECH: %s", text) if ev.transcript: _last_user_speech.append(ev.transcript) + self._transcript.append({"role": "user", "text": ev.transcript}) @self.session.on("conversation_item_added") def _on_conversation_item(ev): @@ -655,6 +663,7 @@ class VoiceSession: text = getattr(ev.item, "text_content", "") or "" if role == "assistant" and text: logger.info("AGENT_SPEECH: %s", text) + self._transcript.append({"role": "assistant", "text": text}) if self._memory and self._caller_user_id and _last_user_speech: user_text = " ".join(_last_user_speech) _last_user_speech.clear() @@ -689,8 +698,11 @@ class VoiceSession: await _store_user_pref(caller_uid, "timezone", iana_timezone) return f"Timezone set to {iana_timezone}" + instructions = _build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section + if self._document_context: + instructions += f"\n\nDokument-Kontext (PDF im Raum hochgeladen):\n{self._document_context}" agent = _NoiseFilterAgent( - instructions=_build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section, + instructions=instructions, tools=[search_web, set_user_timezone], ) io_opts = room_io.RoomOptions(