diff --git a/voice.py b/voice.py
index 3223868..2592da5 100644
--- a/voice.py
+++ b/voice.py
@@ -1136,6 +1136,7 @@ class VoiceSession:
             # Vision tool — capture video frame and analyze with vision model
             _video_track_ref = self  # reference to VoiceSession for video track access
             _lk_room_ref = self.lk_room
+            _session_ref = self.session  # for say() in tools
 
             @function_tool
             async def look_at_screen(question: str) -> str:
@@ -1151,6 +1152,12 @@ class VoiceSession:
                 if not video_track:
                     return ("Kein Video verfuegbar. Der Nutzer muss seine Kamera oder "
                             "Bildschirmfreigabe aktivieren bevor ich etwas sehen kann.")
+                # Speak a short filler phrase first so the user knows the bot is looking (best-effort: errors are ignored)
+                try:
+                    await _session_ref.say("Einen Moment, ich schaue mir das an.",
+                                           allow_interruptions=True, add_to_chat_ctx=False)
+                except Exception:
+                    pass
                 try:
                     # Capture single frame from video track
                     stream = rtc.VideoStream(video_track)