From 3bf9229ae47b5497a0de30acf6fc40cf346a4f25 Mon Sep 17 00:00:00 2001 From: Christian Gick Date: Fri, 27 Feb 2026 07:58:11 +0200 Subject: [PATCH] fix(MAT-56): Prevent bot silence from STT noise leak + LLM timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes for the bot going silent after ~10 messages: 1. STT artifact handler now returns early — previously it logged detected noise leaks ("Vielen Dank.", etc.) but still appended them to the transcript, inflating context until LLM timed out after 4 retries. 2. Context truncation — once the LLM chat context exceeds 45 items it is trimmed to the first item (system prompt) plus the last 40, and once the internal transcript exceeds 80 entries it is trimmed to the last 40, to prevent unbounded growth in long sessions. 3. LLM timeout recovery — watchdog detects when agent has been silent for >60s despite user activity, sends a recovery reply asking user to repeat their question instead of staying permanently silent. Co-Authored-By: Claude Opus 4.6 --- voice.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/voice.py b/voice.py index 6a6c561..6f94414 100644 --- a/voice.py +++ b/voice.py @@ -108,11 +108,15 @@ def _is_stt_artifact(text: str) -> bool: return False +_MAX_CHAT_CTX_ITEMS = 40 # Keep last N items in LLM context to prevent unbounded growth + + class _NoiseFilterAgent(Agent): """Agent that suppresses ElevenLabs STT artifacts before LLM sees them. Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure risk to the audio pipeline. Raises StopResponse to silently discard noise. + Also truncates chat context to prevent unbounded growth in long sessions. 
""" async def on_user_turn_completed( @@ -126,6 +130,15 @@ class _NoiseFilterAgent(Agent): turn_ctx.items.pop() raise StopResponse() + # Truncate context: keep system/instructions + last N items + if len(turn_ctx.items) > _MAX_CHAT_CTX_ITEMS + 5: + # Preserve first item (system prompt) and trim middle + keep_start = 1 # system prompt + keep_end = _MAX_CHAT_CTX_ITEMS + old_len = len(turn_ctx.items) + turn_ctx.items[:] = turn_ctx.items[:keep_start] + turn_ctx.items[-keep_end:] + logger.info("CONTEXT_TRUNCATED: %d -> %d items", old_len, len(turn_ctx.items)) + _vad = None def _get_vad(): @@ -756,11 +769,15 @@ class VoiceSession: text = ev.transcript or "" if text and _is_stt_artifact(text): logger.warning("NOISE_LEAK: artifact reached STT: %s", text) - else: + return # Do NOT add artifacts to transcript — they inflate context + if text: logger.info("USER_SPEECH: %s", text) if ev.transcript: _last_user_speech.append(ev.transcript) self._transcript.append({"role": "user", "text": ev.transcript}) + # Cap transcript to prevent unbounded memory growth + if len(self._transcript) > _MAX_CHAT_CTX_ITEMS * 2: + self._transcript[:] = self._transcript[-_MAX_CHAT_CTX_ITEMS:] @self.session.on("conversation_item_added") def _on_conversation_item(ev): @@ -1047,13 +1064,43 @@ class VoiceSession: except asyncio.TimeoutError: logger.error("Greeting timed out") - # VAD watchdog: log diagnostic and attempt E2EE key recovery if stuck + # VAD watchdog: log diagnostic, attempt E2EE key recovery, and recover from LLM failures import time as _time + _last_agent_speech_time = _time.monotonic() + _llm_recovery_attempted = False + + @self.session.on("conversation_item_added") + def _track_agent_speech(ev): + nonlocal _last_agent_speech_time, _llm_recovery_attempted + role = getattr(ev.item, "role", "?") + if role == "assistant": + _last_agent_speech_time = _time.monotonic() + _llm_recovery_attempted = False # reset on successful speech + while True: await asyncio.sleep(10) + + # LLM timeout 
recovery: if user has been speaking but agent + # hasn't responded in >60s, the LLM pipeline is likely stuck + sc = _vad_state_log.get("speaking_count", 0) + agent_silent_secs = _time.monotonic() - _last_agent_speech_time + if sc > 0 and agent_silent_secs > 60 and not _llm_recovery_attempted: + _llm_recovery_attempted = True + logger.warning( + "LLM_RECOVERY: agent silent for %.0fs after %d user turns " + "— sending recovery reply", agent_silent_secs, sc) + try: + await asyncio.wait_for( + self.session.generate_reply( + instructions="Entschuldigung, ich hatte kurz ein technisches Problem. " + "Kannst du deine letzte Frage bitte wiederholen?"), + timeout=30.0) + logger.info("LLM_RECOVERY: recovery reply sent successfully") + except Exception as exc: + logger.error("LLM_RECOVERY: recovery reply failed: %s", exc) + away_since = _vad_state_log.get("away_since") if away_since and (_time.monotonic() - away_since) > 30: - sc = _vad_state_log.get("speaking_count", 0) e2ee_ok = any( str(getattr(p, '_e2ee_state', '')) == 'OK' for p in self.lk_room.remote_participants.values()