feat(voice): PDF context in voice calls + call transcript summary (MAT-10)
Pass PDF document context from the room to the voice session so the voice LLM can answer questions about uploaded PDFs. Persist call transcripts and post an LLM-generated summary to the room when the call ends.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
16
voice.py
16
voice.py
@@ -257,7 +257,8 @@ def _build_e2ee_options() -> rtc.E2EEOptions:
|
||||
class VoiceSession:
|
||||
def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet",
|
||||
publish_key_cb=None, bot_key: bytes | None = None,
|
||||
memory=None, caller_user_id: str | None = None):
|
||||
memory=None, caller_user_id: str | None = None,
|
||||
document_context: str | None = None):
|
||||
self.nio_client = nio_client
|
||||
self.room_id = room_id
|
||||
self.device_id = device_id
|
||||
@@ -274,6 +275,8 @@ class VoiceSession:
|
||||
self._publish_key_cb = publish_key_cb
|
||||
self._memory = memory # MemoryClient instance from bot.py
|
||||
self._caller_user_id = caller_user_id # Matrix user ID for memory lookup
|
||||
self._document_context = document_context # PDF text from room for voice context
|
||||
self._transcript: list[dict] = [] # {"role": "user"|"assistant", "text": "..."}
|
||||
|
||||
def on_encryption_key(self, sender, device_id, key, index):
|
||||
"""Receive E2EE key from Element Call participant.
|
||||
@@ -382,6 +385,10 @@ class VoiceSession:
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
def get_transcript(self) -> list[dict]:
|
||||
"""Return the call transcript as a list of {role, text} dicts."""
|
||||
return list(self._transcript)
|
||||
|
||||
async def _run(self):
|
||||
try:
|
||||
user_id = self.nio_client.user_id
|
||||
@@ -648,6 +655,7 @@ class VoiceSession:
|
||||
logger.info("USER_SPEECH: %s", text)
|
||||
if ev.transcript:
|
||||
_last_user_speech.append(ev.transcript)
|
||||
self._transcript.append({"role": "user", "text": ev.transcript})
|
||||
|
||||
@self.session.on("conversation_item_added")
|
||||
def _on_conversation_item(ev):
|
||||
@@ -655,6 +663,7 @@ class VoiceSession:
|
||||
text = getattr(ev.item, "text_content", "") or ""
|
||||
if role == "assistant" and text:
|
||||
logger.info("AGENT_SPEECH: %s", text)
|
||||
self._transcript.append({"role": "assistant", "text": text})
|
||||
if self._memory and self._caller_user_id and _last_user_speech:
|
||||
user_text = " ".join(_last_user_speech)
|
||||
_last_user_speech.clear()
|
||||
@@ -689,8 +698,11 @@ class VoiceSession:
|
||||
await _store_user_pref(caller_uid, "timezone", iana_timezone)
|
||||
return f"Timezone set to {iana_timezone}"
|
||||
|
||||
instructions = _build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section
|
||||
if self._document_context:
|
||||
instructions += f"\n\nDokument-Kontext (PDF im Raum hochgeladen):\n{self._document_context}"
|
||||
agent = _NoiseFilterAgent(
|
||||
instructions=_build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section,
|
||||
instructions=instructions,
|
||||
tools=[search_web, set_user_timezone],
|
||||
)
|
||||
io_opts = room_io.RoomOptions(
|
||||
|
||||
Reference in New Issue
Block a user