diff --git a/voice.py b/voice.py index 3223868..2592da5 100644 --- a/voice.py +++ b/voice.py @@ -1136,6 +1136,7 @@ class VoiceSession: # Vision tool — capture video frame and analyze with vision model _video_track_ref = self # reference to VoiceSession for video track access _lk_room_ref = self.lk_room + _session_ref = self.session # for say() in tools @function_tool async def look_at_screen(question: str) -> str: @@ -1151,6 +1152,12 @@ class VoiceSession: if not video_track: return ("Kein Video verfuegbar. Der Nutzer muss seine Kamera oder " "Bildschirmfreigabe aktivieren bevor ich etwas sehen kann.") + # Instant filler so user knows bot is looking + try: + await _session_ref.say("Einen Moment, ich schaue mir das an.", + allow_interruptions=True, add_to_chat_ctx=False) + except Exception: + pass try: # Capture single frame from video track stream = rtc.VideoStream(video_track)