From cb539860d9e23b2b56816aa8bbeb976b7adede99 Mon Sep 17 00:00:00 2001
From: Christian Gick
Date: Mon, 9 Mar 2026 16:20:04 +0200
Subject: [PATCH] feat(MAT-140): Bridge voice and text context + capture video
 from text chat

1. Text bot can now capture video frames from active call when user types
   vision-related queries ("siehst du meinen bildschirm", etc.)
2. Voice transcript injected into text bot context during active calls
3. Text messages injected into voice transcript with [typed in chat] prefix
4. Bot text replies injected back into voice transcript

This enables seamless context sharing between voice calls and text chat.

Co-Authored-By: Claude Opus 4.6
---
 bot.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/bot.py b/bot.py
index e7ac284..f473dfc 100644
--- a/bot.py
+++ b/bot.py
@@ -40,7 +40,7 @@ from nio import (
     ToDeviceError,
 )
 from nio.crypto.attachments import decrypt_attachment
-from livekit import api
+from livekit import api, rtc
 
 from voice import VoiceSession
 from article_summary import ArticleSummaryHandler
@@ -1851,6 +1851,36 @@
             image_data = (b64, mime)
             del self._recent_images[room.room_id]
 
+        # If no cached image but user asks about screen/camera, try capturing from active call
+        if not image_data and re.search(
+            r'siehst du|bildschirm|screen|was siehst|kannst du sehen|schau mal|look at|can you see|zeig',
+            body, re.IGNORECASE
+        ):
+            vs = self.voice_sessions.get(room.room_id)
+            if vs and vs._video_track:
+                try:
+                    stream = rtc.VideoStream(vs._video_track)
+                    frame = None
+                    async for f in stream:
+                        frame = f
+                        break
+                    try:
+                        await stream.aclose()
+                    except Exception:
+                        pass
+                    if frame:
+                        from PIL import Image
+                        vf = getattr(frame, 'frame', frame)
+                        rgba = vf.convert(rtc.VideoBufferType.RGBA)
+                        img = Image.frombytes("RGBA", (rgba.width, rgba.height), bytes(rgba.data))
+                        buf = io.BytesIO()
+                        img.convert("RGB").save(buf, format="JPEG", quality=85)
+                        img_b64 = base64.b64encode(buf.getvalue()).decode()
+                        image_data = (img_b64, "image/jpeg")
+                        logger.info("Captured %dx%d frame from active call for text query", rgba.width, rgba.height)
+                except Exception as exc:
+                    logger.warning("Failed to capture frame from call: %s", exc)
+
         # Detect Confluence page links → store page ID for voice session context
         confluence_page_id = None
         conf_long = re.search(r'agiliton\.atlassian\.net/wiki/.*?pages/(\d+)', body)
@@ -1959,9 +1989,17 @@
             await self._send_text(room.room_id, summary_response)
             return
 
+        # Inject typed message into active voice session transcript for context sharing
+        vs = self.voice_sessions.get(room.room_id)
+        if vs:
+            vs._transcript.append({"role": "user", "text": f"[typed in chat] {body}"})
+
         await self.client.room_typing(room.room_id, typing_state=True)
         try:
-            await self._respond_with_ai(room, body, sender=sender, image_data=image_data)
+            reply = await self._respond_with_ai(room, body, sender=sender, image_data=image_data)
+            # Also inject bot's text reply into voice transcript
+            if reply and vs:
+                vs._transcript.append({"role": "assistant", "text": f"[replied in chat] {reply}"})
         finally:
             await self.client.room_typing(room.room_id, typing_state=False)
 
@@ -2674,6 +2712,22 @@
                 "You CAN access and read these documents — never say you cannot."
             )
 
+        # Inject voice call transcript if there's an active call in this room
+        voice_context = ""
+        vs = self.voice_sessions.get(room.room_id)
+        if vs:
+            transcript = vs.get_transcript()
+            if transcript:
+                recent = transcript[-20:]  # last 20 entries
+                lines = [f"{'User' if e['role'] == 'user' else 'Assistant'}: {e['text']}"
+                         for e in recent]
+                voice_context = (
+                    "Active voice call transcript (recent conversation spoken aloud):\n"
+                    + "\n".join(lines)
+                    + "\n\nThe user is currently in a voice call and also typing in chat. "
+                    "Use the voice transcript as context for their text messages."
+                )
+
         # Build conversation context
         messages = [{"role": "system", "content": SYSTEM_PROMPT}]
         if memory_context:
@@ -2682,6 +2736,8 @@
             messages.append({"role": "system", "content": chunk_context})
         if room_doc_context:
             messages.append({"role": "system", "content": room_doc_context})
+        if voice_context:
+            messages.append({"role": "system", "content": voice_context})
         # When RAG returns documents, limit history to 4 messages (2 exchanges) to prevent
         # stale answer patterns from overriding fresh search results
         history_limit = 4 if doc_context else 10