From cb539860d9e23b2b56816aa8bbeb976b7adede99 Mon Sep 17 00:00:00 2001
From: Christian Gick
Date: Mon, 9 Mar 2026 16:20:04 +0200
Subject: [PATCH] feat(MAT-140): Bridge voice and text context + capture video
 from text chat

1. Text bot can now capture video frames from active call when user types
   vision-related queries ("siehst du meinen bildschirm", etc.)
2. Voice transcript injected into text bot context during active calls
3. Text messages injected into voice transcript with [typed in chat] prefix
4. Bot text replies injected back into voice transcript

This enables seamless context sharing between voice calls and text chat.

Co-Authored-By: Claude Opus 4.6
---
 bot.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/bot.py b/bot.py
index e7ac284..f473dfc 100644
--- a/bot.py
+++ b/bot.py
@@ -40,7 +40,7 @@ from nio import (
     ToDeviceError,
 )
 from nio.crypto.attachments import decrypt_attachment
-from livekit import api
+from livekit import api, rtc
 
 from voice import VoiceSession
 from article_summary import ArticleSummaryHandler
@@ -1851,6 +1851,36 @@
             image_data = (b64, mime)
             del self._recent_images[room.room_id]
 
+        # If no cached image but user asks about screen/camera, try capturing from active call
+        if not image_data and re.search(
+            r'siehst du|bildschirm|screen|was siehst|kannst du sehen|schau mal|look at|can you see|zeig',
+            body, re.IGNORECASE
+        ):
+            vs = self.voice_sessions.get(room.room_id)
+            if vs and vs._video_track:
+                try:
+                    stream = rtc.VideoStream(vs._video_track)
+                    frame = None
+                    async for f in stream:
+                        frame = f
+                        break
+                    try:
+                        await stream.aclose()
+                    except Exception:
+                        pass
+                    if frame:
+                        from PIL import Image
+                        vf = getattr(frame, 'frame', frame)
+                        rgba = vf.convert(rtc.VideoBufferType.RGBA)
+                        img = Image.frombytes("RGBA", (rgba.width, rgba.height), bytes(rgba.data))
+                        buf = io.BytesIO()
+                        img.convert("RGB").save(buf, format="JPEG", quality=85)
+                        img_b64 = base64.b64encode(buf.getvalue()).decode()
+                        image_data = (img_b64, "image/jpeg")
+                        logger.info("Captured %dx%d frame from active call for text query", rgba.width, rgba.height)
+                except Exception as exc:
+                    logger.warning("Failed to capture frame from call: %s", exc)
+
         # Detect Confluence page links → store page ID for voice session context
         confluence_page_id = None
         conf_long = re.search(r'agiliton\.atlassian\.net/wiki/.*?pages/(\d+)', body)
@@ -1959,9 +1989,17 @@
             await self._send_text(room.room_id, summary_response)
             return
 
+        # Inject typed message into active voice session transcript for context sharing
+        vs = self.voice_sessions.get(room.room_id)
+        if vs:
+            vs._transcript.append({"role": "user", "text": f"[typed in chat] {body}"})
+
         await self.client.room_typing(room.room_id, typing_state=True)
         try:
-            await self._respond_with_ai(room, body, sender=sender, image_data=image_data)
+            reply = await self._respond_with_ai(room, body, sender=sender, image_data=image_data)
+            # Also inject bot's text reply into voice transcript
+            if reply and vs:
+                vs._transcript.append({"role": "assistant", "text": f"[replied in chat] {reply}"})
         finally:
             await self.client.room_typing(room.room_id, typing_state=False)
 
@@ -2674,6 +2712,22 @@
                 "You CAN access and read these documents — never say you cannot."
             )
 
+        # Inject voice call transcript if there's an active call in this room
+        voice_context = ""
+        vs = self.voice_sessions.get(room.room_id)
+        if vs:
+            transcript = vs.get_transcript()
+            if transcript:
+                recent = transcript[-20:]  # last 20 entries
+                lines = [f"{'User' if e['role'] == 'user' else 'Assistant'}: {e['text']}"
+                         for e in recent]
+                voice_context = (
+                    "Active voice call transcript (recent conversation spoken aloud):\n"
+                    + "\n".join(lines)
+                    + "\n\nThe user is currently in a voice call and also typing in chat. "
+                    "Use the voice transcript as context for their text messages."
+                )
+
         # Build conversation context
         messages = [{"role": "system", "content": SYSTEM_PROMPT}]
         if memory_context:
@@ -2682,6 +2736,8 @@
             messages.append({"role": "system", "content": chunk_context})
         if room_doc_context:
             messages.append({"role": "system", "content": room_doc_context})
+        if voice_context:
+            messages.append({"role": "system", "content": voice_context})
         # When RAG returns documents, limit history to 4 messages (2 exchanges) to prevent
         # stale answer patterns from overriding fresh search results
         history_limit = 4 if doc_context else 10