From 3bf9229ae47b5497a0de30acf6fc40cf346a4f25 Mon Sep 17 00:00:00 2001
From: Christian Gick <service@agiliton.eu>
Date: Fri, 27 Feb 2026 07:58:11 +0200
Subject: [PATCH] fix(MAT-56): Prevent bot silence from STT noise leak + LLM
 timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three fixes for the bot going silent after ~10 messages:

1. STT artifact handler now returns early — previously detected noise
   leaks ("Vielen Dank.", etc.) but still appended them to transcript,
   inflating context until LLM timed out after 4 retries.

2. Context truncation — caps LLM chat context at 40 items and internal
   transcript at 80 entries to prevent unbounded growth in long sessions.

3. LLM timeout recovery — watchdog detects when agent has been silent
   for >60s despite user activity, sends a recovery reply asking user
   to repeat their question instead of staying permanently silent.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 voice.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/voice.py b/voice.py
index 6a6c561..6f94414 100644
--- a/voice.py
+++ b/voice.py
@@ -108,11 +108,15 @@ def _is_stt_artifact(text: str) -> bool:
     return False
 
 
+_MAX_CHAT_CTX_ITEMS = 40  # Keep last N items in LLM context to prevent unbounded growth
+
+
 class _NoiseFilterAgent(Agent):
     """Agent that suppresses ElevenLabs STT artifacts before LLM sees them.
 
     Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
     risk to the audio pipeline. Raises StopResponse to silently discard noise.
+    Also truncates chat context to prevent unbounded growth in long sessions.
     """
 
     async def on_user_turn_completed(
@@ -126,6 +130,15 @@ class _NoiseFilterAgent(Agent):
                 turn_ctx.items.pop()
             raise StopResponse()
 
+        # Truncate context: keep system/instructions + last N items
+        if len(turn_ctx.items) > _MAX_CHAT_CTX_ITEMS + 5:
+            # Preserve first item (system prompt) and trim middle
+            keep_start = 1  # system prompt
+            keep_end = _MAX_CHAT_CTX_ITEMS
+            old_len = len(turn_ctx.items)
+            turn_ctx.items[:] = turn_ctx.items[:keep_start] + turn_ctx.items[-keep_end:]
+            logger.info("CONTEXT_TRUNCATED: %d -> %d items", old_len, len(turn_ctx.items))
+
 
 _vad = None
 def _get_vad():
@@ -756,11 +769,15 @@ class VoiceSession:
                 text = ev.transcript or ""
                 if text and _is_stt_artifact(text):
                     logger.warning("NOISE_LEAK: artifact reached STT: %s", text)
-                else:
+                    return  # Do NOT add artifacts to transcript — they inflate context
+                if text:
                     logger.info("USER_SPEECH: %s", text)
                 if ev.transcript:
                     _last_user_speech.append(ev.transcript)
                     self._transcript.append({"role": "user", "text": ev.transcript})
+                    # Cap transcript to prevent unbounded memory growth
+                    if len(self._transcript) > _MAX_CHAT_CTX_ITEMS * 2:
+                        self._transcript[:] = self._transcript[-_MAX_CHAT_CTX_ITEMS:]
 
             @self.session.on("conversation_item_added")
             def _on_conversation_item(ev):
@@ -1047,13 +1064,43 @@ class VoiceSession:
             except asyncio.TimeoutError:
                 logger.error("Greeting timed out")
 
-            # VAD watchdog: log diagnostic and attempt E2EE key recovery if stuck
+            # VAD watchdog: log diagnostic, attempt E2EE key recovery, and recover from LLM failures
             import time as _time
+            _last_agent_speech_time = _time.monotonic()
+            _llm_recovery_attempted = False
+
+            @self.session.on("conversation_item_added")
+            def _track_agent_speech(ev):
+                nonlocal _last_agent_speech_time, _llm_recovery_attempted
+                role = getattr(ev.item, "role", "?")
+                if role == "assistant":
+                    _last_agent_speech_time = _time.monotonic()
+                    _llm_recovery_attempted = False  # reset on successful speech
+
             while True:
                 await asyncio.sleep(10)
+
+                # LLM timeout recovery: if user has been speaking but agent
+                # hasn't responded in >60s, the LLM pipeline is likely stuck
+                sc = _vad_state_log.get("speaking_count", 0)
+                agent_silent_secs = _time.monotonic() - _last_agent_speech_time
+                if sc > 0 and agent_silent_secs > 60 and not _llm_recovery_attempted:
+                    _llm_recovery_attempted = True
+                    logger.warning(
+                        "LLM_RECOVERY: agent silent for %.0fs after %d user turns "
+                        "— sending recovery reply", agent_silent_secs, sc)
+                    try:
+                        await asyncio.wait_for(
+                            self.session.generate_reply(
+                                instructions="Entschuldigung, ich hatte kurz ein technisches Problem. "
+                                "Kannst du deine letzte Frage bitte wiederholen?"),
+                            timeout=30.0)
+                        logger.info("LLM_RECOVERY: recovery reply sent successfully")
+                    except Exception as exc:
+                        logger.error("LLM_RECOVERY: recovery reply failed: %s", exc)
+
                 away_since = _vad_state_log.get("away_since")
                 if away_since and (_time.monotonic() - away_since) > 30:
-                    sc = _vad_state_log.get("speaking_count", 0)
                     e2ee_ok = any(
                         str(getattr(p, '_e2ee_state', '')) == 'OK'
                         for p in self.lk_room.remote_participants.values()