feat(voice): PDF context in voice calls + call transcript summary (MAT-10)

Pass PDF document context from the room to the voice session so the voice
LLM can answer questions about PDFs uploaded within the last hour. Record
the call transcript during the session and post an LLM-generated summary
to the room when the call ends.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-23 11:21:31 +02:00
parent 1ec63b93f2
commit 90e662be96
2 changed files with 71 additions and 2 deletions

57
bot.py
View File

@@ -281,6 +281,7 @@ class Bot:
self._pending_connects: dict[str, str] = {} # matrix_user_id -> device_code self._pending_connects: dict[str, str] = {} # matrix_user_id -> device_code
self._pending_translate: dict[str, dict] = {} # sender -> {text, detected_lang, room_id} self._pending_translate: dict[str, dict] = {} # sender -> {text, detected_lang, room_id}
self._pending_reply: dict[str, dict] = {} # sender -> {target_lang} self._pending_reply: dict[str, dict] = {} # sender -> {target_lang}
self._room_pdf_context: dict[str, dict] = {} # room_id -> {filename, text, timestamp}
@staticmethod @staticmethod
def _load_user_keys() -> dict[str, str]: def _load_user_keys() -> dict[str, str]:
@@ -462,6 +463,14 @@ class Bot:
import secrets import secrets
bot_key = secrets.token_bytes(16) bot_key = secrets.token_bytes(16)
# Get PDF context if recently uploaded (within 1 hour)
pdf_ctx = self._room_pdf_context.get(room_id, {})
pdf_text = None
if pdf_ctx and time.time() - pdf_ctx.get("timestamp", 0) < 3600:
pdf_text = pdf_ctx.get("text")
logger.info("Passing PDF context to voice session: %s (%d chars)",
pdf_ctx.get("filename", "?"), len(pdf_text) if pdf_text else 0)
vs = VoiceSession( vs = VoiceSession(
nio_client=self.client, nio_client=self.client,
room_id=room_id, room_id=room_id,
@@ -473,6 +482,7 @@ class Bot:
self._publish_encryption_key(rid, key)), self._publish_encryption_key(rid, key)),
memory=self.memory, memory=self.memory,
caller_user_id=event.sender, caller_user_id=event.sender,
document_context=pdf_text,
) )
# Check timeline for caller's key # Check timeline for caller's key
@@ -506,11 +516,19 @@ class Bot:
self.active_callers.pop(room_id, None) self.active_callers.pop(room_id, None)
vs = self.voice_sessions.pop(room_id, None) vs = self.voice_sessions.pop(room_id, None)
if vs: if vs:
transcript = vs.get_transcript()
try: try:
await vs.stop() await vs.stop()
logger.info("Voice session stopped for %s", room_id) logger.info("Voice session stopped for %s", room_id)
except Exception: except Exception:
logger.exception("Failed to stop voice session for %s", room_id) logger.exception("Failed to stop voice session for %s", room_id)
# Post call summary to room
if transcript:
try:
summary = await self._summarize_call(transcript, room_id)
await self._send_text(room_id, f"**Anruf-Zusammenfassung:**\n\n{summary}")
except Exception:
logger.exception("Failed to post call summary for %s", room_id)
# Leave the call too # Leave the call too
self.active_calls.discard(room_id) self.active_calls.discard(room_id)
@@ -965,6 +983,13 @@ class Bot:
if len(pdf_text) > 50000: if len(pdf_text) > 50000:
pdf_text = pdf_text[:50000] + "\n\n[... truncated, PDF too long ...]" pdf_text = pdf_text[:50000] + "\n\n[... truncated, PDF too long ...]"
# Store PDF context for voice session pickup
self._room_pdf_context[room.room_id] = {
"filename": filename,
"text": pdf_text,
"timestamp": time.time(),
}
user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.' user_message = f'The user sent a PDF file named "{filename}". Here is the extracted text:\n\n{pdf_text}\n\nPlease summarize or answer questions about this document.'
await self.client.room_typing(room.room_id, typing_state=True) await self.client.room_typing(room.room_id, typing_state=True)
@@ -1458,6 +1483,38 @@ class Bot:
content=content, content=content,
) )
async def _summarize_call(self, transcript: list[dict], room_id: str) -> str:
    """Summarize a voice call transcript via the LLM for posting to the room.

    Only the 30 most recent transcript entries are considered. Each entry is
    a dict with a "role" key ("user"; any other value is labelled as the
    assistant) and a "text" key.

    Args:
        transcript: Transcript entries, oldest first.
        room_id: Room the call took place in; used to look up the model in
            ``self.room_models`` (falling back to ``DEFAULT_MODEL``).

    Returns:
        The LLM-generated summary. If no LLM is configured, the request
        fails, or the LLM returns no text, the last 20 formatted transcript
        lines are returned instead. An empty transcript yields an empty
        string without calling the LLM.
    """
    # Format the most recent entries as "<speaker>: <text>" lines.
    lines = []
    for entry in transcript[-30:]:
        speaker = "Nutzer" if entry["role"] == "user" else "Assistent"
        lines.append(f"{speaker}: {entry['text']}")

    # Raw-transcript fallback: the last 20 formatted lines.
    raw_fallback = "\n".join(lines[-20:])

    # Nothing to summarize, or nobody to summarize it.
    if not lines or not self.llm:
        return raw_fallback

    try:
        model = self.room_models.get(room_id, DEFAULT_MODEL)
        resp = await self.llm.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": (
                    "Fasse das folgende Anruf-Transkript kurz und praegnant zusammen. "
                    "Nenne die wichtigsten besprochenen Punkte, Entscheidungen und offene Fragen. "
                    "Antworte in der Sprache des Gespraechs. Maximal 5-8 Saetze."
                )},
                {"role": "user", "content": "\n".join(lines)},
            ],
            max_tokens=500,
        )
        # The message content may be None; treat that like an empty reply
        # instead of relying on an AttributeError reaching the handler below.
        summary = (resp.choices[0].message.content or "").strip()
    except Exception:
        # Best effort: a failed summary must not prevent posting something.
        logger.warning("Call summary LLM failed, falling back to raw transcript", exc_info=True)
        return raw_fallback

    if not summary:
        # Avoid posting a summary header with an empty body.
        logger.warning("Call summary LLM returned no text, falling back to raw transcript")
        return raw_fallback
    return summary
async def _send_text(self, room_id: str, text: str): async def _send_text(self, room_id: str, text: str):
await self.client.room_send( await self.client.room_send(
room_id, room_id,

View File

@@ -257,7 +257,8 @@ def _build_e2ee_options() -> rtc.E2EEOptions:
class VoiceSession: class VoiceSession:
def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet", def __init__(self, nio_client, room_id, device_id, lk_url, model="claude-sonnet",
publish_key_cb=None, bot_key: bytes | None = None, publish_key_cb=None, bot_key: bytes | None = None,
memory=None, caller_user_id: str | None = None): memory=None, caller_user_id: str | None = None,
document_context: str | None = None):
self.nio_client = nio_client self.nio_client = nio_client
self.room_id = room_id self.room_id = room_id
self.device_id = device_id self.device_id = device_id
@@ -274,6 +275,8 @@ class VoiceSession:
self._publish_key_cb = publish_key_cb self._publish_key_cb = publish_key_cb
self._memory = memory # MemoryClient instance from bot.py self._memory = memory # MemoryClient instance from bot.py
self._caller_user_id = caller_user_id # Matrix user ID for memory lookup self._caller_user_id = caller_user_id # Matrix user ID for memory lookup
self._document_context = document_context # PDF text from room for voice context
self._transcript: list[dict] = [] # {"role": "user"|"assistant", "text": "..."}
def on_encryption_key(self, sender, device_id, key, index): def on_encryption_key(self, sender, device_id, key, index):
"""Receive E2EE key from Element Call participant. """Receive E2EE key from Element Call participant.
@@ -382,6 +385,10 @@ class VoiceSession:
except asyncio.CancelledError: except asyncio.CancelledError:
pass pass
def get_transcript(self) -> list[dict]:
    """Snapshot the transcript collected so far.

    Returns a new list holding the recorded {"role", "text"} entries, so
    adding or removing items on the result does not affect the session's own
    list. The entry dicts themselves are shared, not copied.
    """
    snapshot = [*self._transcript]
    return snapshot
async def _run(self): async def _run(self):
try: try:
user_id = self.nio_client.user_id user_id = self.nio_client.user_id
@@ -648,6 +655,7 @@ class VoiceSession:
logger.info("USER_SPEECH: %s", text) logger.info("USER_SPEECH: %s", text)
if ev.transcript: if ev.transcript:
_last_user_speech.append(ev.transcript) _last_user_speech.append(ev.transcript)
self._transcript.append({"role": "user", "text": ev.transcript})
@self.session.on("conversation_item_added") @self.session.on("conversation_item_added")
def _on_conversation_item(ev): def _on_conversation_item(ev):
@@ -655,6 +663,7 @@ class VoiceSession:
text = getattr(ev.item, "text_content", "") or "" text = getattr(ev.item, "text_content", "") or ""
if role == "assistant" and text: if role == "assistant" and text:
logger.info("AGENT_SPEECH: %s", text) logger.info("AGENT_SPEECH: %s", text)
self._transcript.append({"role": "assistant", "text": text})
if self._memory and self._caller_user_id and _last_user_speech: if self._memory and self._caller_user_id and _last_user_speech:
user_text = " ".join(_last_user_speech) user_text = " ".join(_last_user_speech)
_last_user_speech.clear() _last_user_speech.clear()
@@ -689,8 +698,11 @@ class VoiceSession:
await _store_user_pref(caller_uid, "timezone", iana_timezone) await _store_user_pref(caller_uid, "timezone", iana_timezone)
return f"Timezone set to {iana_timezone}" return f"Timezone set to {iana_timezone}"
instructions = _build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section
if self._document_context:
instructions += f"\n\nDokument-Kontext (PDF im Raum hochgeladen):\n{self._document_context}"
agent = _NoiseFilterAgent( agent = _NoiseFilterAgent(
instructions=_build_voice_prompt(model=self.model, timezone=user_timezone) + memory_section, instructions=instructions,
tools=[search_web, set_user_timezone], tools=[search_web, set_user_timezone],
) )
io_opts = room_io.RoomOptions( io_opts = room_io.RoomOptions(