feat(voice): PDF context in voice calls + call transcript summary (MAT-10)

Pass PDF document context from the room to the voice session so the voice
LLM can answer questions about uploaded PDFs. Persist call transcripts and
post an LLM-generated summary to the room when the call ends.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-23 11:21:31 +02:00
parent 1ec63b93f2
commit 90e662be96
2 changed files with 71 additions and 2 deletions

View File

@@ -257,7 +257,8 @@ def _build_e2ee_options() -> rtc.E2EEOptions:
class VoiceSession:
def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet",
publish_key_cb=None, bot_key: bytes | None = None,
memory=None, caller_user_id: str | None = None):
memory=None, caller_user_id: str | None = None,
document_context: str | None = None):
self.nio_client = nio_client
self.room_id = room_id
self.device_id = device_id
@@ -274,6 +275,8 @@ class VoiceSession:
self._publish_key_cb = publish_key_cb
self._memory = memory # MemoryClient instance from bot.py
self._caller_user_id = caller_user_id # Matrix user ID for memory lookup
self._document_context = document_context # PDF text from room for voice context
self._transcript: list[dict] = [] # {"role": "user"|"assistant", "text": "..."}
def on_encryption_key(self, sender, device_id, key, index):
"""Receive E2EE key from Element Call participant.
@@ -382,6 +385,10 @@ class VoiceSession:
except asyncio.CancelledError:
pass
def get_transcript(self) -> list[dict]:
    """Give callers a snapshot of the transcript collected so far.

    A new list is built on every call, so adding or removing items on the
    returned list leaves the session's own transcript list untouched. The
    entries themselves are the same dict objects (shallow copy), each
    shaped like ``{"role": ..., "text": ...}``.
    """
    # Unpack into a fresh list rather than exposing the internal one.
    return [*self._transcript]
async def _run(self):
try:
user_id = self.nio_client.user_id
@@ -648,6 +655,7 @@ class VoiceSession:
logger.info("USER_SPEECH: %s", text)
if ev.transcript:
_last_user_speech.append(ev.transcript)
self._transcript.append({"role": "user", "text": ev.transcript})
@self.session.on("conversation_item_added")
def _on_conversation_item(ev):
@@ -655,6 +663,7 @@ class VoiceSession:
text = getattr(ev.item, "text_content", "") or ""
if role == "assistant" and text:
logger.info("AGENT_SPEECH: %s", text)
self._transcript.append({"role": "assistant", "text": text})
if self._memory and self._caller_user_id and _last_user_speech:
user_text = " ".join(_last_user_speech)
_last_user_speech.clear()
@@ -689,8 +698,11 @@ class VoiceSession:
await _store_user_pref(caller_uid, "timezone", iana_timezone)
return f"Timezone set to {iana_timezone}"
instructions = _build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section
if self._document_context:
instructions += f"\n\nDokument-Kontext (PDF im Raum hochgeladen):\n{self._document_context}"
agent = _NoiseFilterAgent(
instructions=_build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section,
instructions=instructions,
tools=[search_web, set_user_timezone],
)
io_opts = room_io.RoomOptions(