feat(voice): PDF context in voice calls + call transcript summary (MAT-10)

Pass PDF document context from the room to the voice session so the voice LLM can answer questions about uploaded PDFs. Persist call transcripts and post an LLM-generated summary to the room when the call ends.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 11:21:31 +02:00
parent 1ec63b93f2
commit 90e662be96
2 changed files with 71 additions and 2 deletions
--- a/voice.py
+++ b/voice.py
@@ -257,7 +257,8 @@ def _build_e2ee_options() -> rtc.E2EEOptions:
 class VoiceSession:
    def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet",
                 publish_key_cb=None, bot_key: bytes | None = None,
-                 memory=None, caller_user_id: str | None = None):
+                 memory=None, caller_user_id: str | None = None,
+                 document_context: str | None = None):
        self.nio_client = nio_client
        self.room_id = room_id
        self.device_id = device_id
@@ -274,6 +275,8 @@ class VoiceSession:
        self._publish_key_cb = publish_key_cb
        self._memory = memory          # MemoryClient instance from bot.py
        self._caller_user_id = caller_user_id  # Matrix user ID for memory lookup
+        self._document_context = document_context  # PDF text from room for voice context
+        self._transcript: list[dict] = []  # {"role": "user"|"assistant", "text": "..."}

    def on_encryption_key(self, sender, device_id, key, index):
        """Receive E2EE key from Element Call participant.
@@ -382,6 +385,10 @@ class VoiceSession:
            except asyncio.CancelledError:
                pass

+    def get_transcript(self) -> list[dict]:
+        """Return a shallow copy of the call transcript as a list of {role, text} dicts."""
+        return list(self._transcript)
+
    async def _run(self):
        try:
            user_id = self.nio_client.user_id
@@ -648,6 +655,7 @@ class VoiceSession:
                    logger.info("USER_SPEECH: %s", text)
                if ev.transcript:
                    _last_user_speech.append(ev.transcript)
+                    self._transcript.append({"role": "user", "text": ev.transcript})

            @self.session.on("conversation_item_added")
            def _on_conversation_item(ev):
@@ -655,6 +663,7 @@ class VoiceSession:
                text = getattr(ev.item, "text_content", "") or ""
                if role == "assistant" and text:
                    logger.info("AGENT_SPEECH: %s", text)
+                    self._transcript.append({"role": "assistant", "text": text})
                    if self._memory and self._caller_user_id and _last_user_speech:
                        user_text = " ".join(_last_user_speech)
                        _last_user_speech.clear()
@@ -689,8 +698,11 @@ class VoiceSession:
                    await _store_user_pref(caller_uid, "timezone", iana_timezone)
                return f"Timezone set to {iana_timezone}"

+            instructions = _build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section
+            if self._document_context:
+                instructions += f"\n\nDokument-Kontext (PDF im Raum hochgeladen):\n{self._document_context}"
            agent = _NoiseFilterAgent(
-                instructions=_build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section,
+                instructions=instructions,
                tools=[search_web, set_user_timezone],
            )
            io_opts = room_io.RoomOptions(