feat(MAT-140): Bridge voice and text context + capture video from text chat
1. Text bot can now capture a video frame from the active call when the user
types a vision-related query ("siehst du meinen bildschirm", etc.)
2. Voice transcript injected into text bot context during active calls
3. Text messages injected into voice transcript with [typed in chat] prefix
4. Bot text replies injected back into voice transcript with [replied in chat] prefix
This enables seamless context sharing between voice calls and text chat.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
60
bot.py
60
bot.py
@@ -40,7 +40,7 @@ from nio import (
|
|||||||
ToDeviceError,
|
ToDeviceError,
|
||||||
)
|
)
|
||||||
from nio.crypto.attachments import decrypt_attachment
|
from nio.crypto.attachments import decrypt_attachment
|
||||||
from livekit import api
|
from livekit import api, rtc
|
||||||
from voice import VoiceSession
|
from voice import VoiceSession
|
||||||
from article_summary import ArticleSummaryHandler
|
from article_summary import ArticleSummaryHandler
|
||||||
|
|
||||||
@@ -1851,6 +1851,36 @@ class Bot:
|
|||||||
image_data = (b64, mime)
|
image_data = (b64, mime)
|
||||||
del self._recent_images[room.room_id]
|
del self._recent_images[room.room_id]
|
||||||
|
|
||||||
|
# If no cached image but user asks about screen/camera, try capturing from active call
|
||||||
|
if not image_data and re.search(
|
||||||
|
r'siehst du|bildschirm|screen|was siehst|kannst du sehen|schau mal|look at|can you see|zeig',
|
||||||
|
body, re.IGNORECASE
|
||||||
|
):
|
||||||
|
vs = self.voice_sessions.get(room.room_id)
|
||||||
|
if vs and vs._video_track:
|
||||||
|
try:
|
||||||
|
stream = rtc.VideoStream(vs._video_track)
|
||||||
|
frame = None
|
||||||
|
async for f in stream:
|
||||||
|
frame = f
|
||||||
|
break
|
||||||
|
try:
|
||||||
|
await stream.aclose()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if frame:
|
||||||
|
from PIL import Image
|
||||||
|
vf = getattr(frame, 'frame', frame)
|
||||||
|
rgba = vf.convert(rtc.VideoBufferType.RGBA)
|
||||||
|
img = Image.frombytes("RGBA", (rgba.width, rgba.height), bytes(rgba.data))
|
||||||
|
buf = io.BytesIO()
|
||||||
|
img.convert("RGB").save(buf, format="JPEG", quality=85)
|
||||||
|
img_b64 = base64.b64encode(buf.getvalue()).decode()
|
||||||
|
image_data = (img_b64, "image/jpeg")
|
||||||
|
logger.info("Captured %dx%d frame from active call for text query", rgba.width, rgba.height)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Failed to capture frame from call: %s", exc)
|
||||||
|
|
||||||
# Detect Confluence page links → store page ID for voice session context
|
# Detect Confluence page links → store page ID for voice session context
|
||||||
confluence_page_id = None
|
confluence_page_id = None
|
||||||
conf_long = re.search(r'agiliton\.atlassian\.net/wiki/.*?pages/(\d+)', body)
|
conf_long = re.search(r'agiliton\.atlassian\.net/wiki/.*?pages/(\d+)', body)
|
||||||
@@ -1959,9 +1989,17 @@ class Bot:
|
|||||||
await self._send_text(room.room_id, summary_response)
|
await self._send_text(room.room_id, summary_response)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Inject typed message into active voice session transcript for context sharing
|
||||||
|
vs = self.voice_sessions.get(room.room_id)
|
||||||
|
if vs:
|
||||||
|
vs._transcript.append({"role": "user", "text": f"[typed in chat] {body}"})
|
||||||
|
|
||||||
await self.client.room_typing(room.room_id, typing_state=True)
|
await self.client.room_typing(room.room_id, typing_state=True)
|
||||||
try:
|
try:
|
||||||
await self._respond_with_ai(room, body, sender=sender, image_data=image_data)
|
reply = await self._respond_with_ai(room, body, sender=sender, image_data=image_data)
|
||||||
|
# Also inject bot's text reply into voice transcript
|
||||||
|
if reply and vs:
|
||||||
|
vs._transcript.append({"role": "assistant", "text": f"[replied in chat] {reply}"})
|
||||||
finally:
|
finally:
|
||||||
await self.client.room_typing(room.room_id, typing_state=False)
|
await self.client.room_typing(room.room_id, typing_state=False)
|
||||||
|
|
||||||
@@ -2674,6 +2712,22 @@ class Bot:
|
|||||||
"You CAN access and read these documents — never say you cannot."
|
"You CAN access and read these documents — never say you cannot."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Inject voice call transcript if there's an active call in this room
|
||||||
|
voice_context = ""
|
||||||
|
vs = self.voice_sessions.get(room.room_id)
|
||||||
|
if vs:
|
||||||
|
transcript = vs.get_transcript()
|
||||||
|
if transcript:
|
||||||
|
recent = transcript[-20:] # last 20 entries
|
||||||
|
lines = [f"{'User' if e['role'] == 'user' else 'Assistant'}: {e['text']}"
|
||||||
|
for e in recent]
|
||||||
|
voice_context = (
|
||||||
|
"Active voice call transcript (recent conversation spoken aloud):\n"
|
||||||
|
+ "\n".join(lines)
|
||||||
|
+ "\n\nThe user is currently in a voice call and also typing in chat. "
|
||||||
|
"Use the voice transcript as context for their text messages."
|
||||||
|
)
|
||||||
|
|
||||||
# Build conversation context
|
# Build conversation context
|
||||||
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
|
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
|
||||||
if memory_context:
|
if memory_context:
|
||||||
@@ -2682,6 +2736,8 @@ class Bot:
|
|||||||
messages.append({"role": "system", "content": chunk_context})
|
messages.append({"role": "system", "content": chunk_context})
|
||||||
if room_doc_context:
|
if room_doc_context:
|
||||||
messages.append({"role": "system", "content": room_doc_context})
|
messages.append({"role": "system", "content": room_doc_context})
|
||||||
|
if voice_context:
|
||||||
|
messages.append({"role": "system", "content": voice_context})
|
||||||
# When RAG returns documents, limit history to 4 messages (2 exchanges) to prevent
|
# When RAG returns documents, limit history to 4 messages (2 exchanges) to prevent
|
||||||
# stale answer patterns from overriding fresh search results
|
# stale answer patterns from overriding fresh search results
|
||||||
history_limit = 4 if doc_context else 10
|
history_limit = 4 if doc_context else 10
|
||||||
|
|||||||
Reference in New Issue
Block a user