fix(MAT-56): Prevent bot silence from STT noise leak + LLM timeout
Three fixes for the bot going silent after ~10 messages:
1. STT artifact handler now returns early — previously detected noise
leaks ("Vielen Dank.", etc.) but still appended them to transcript,
inflating context until LLM timed out after 4 retries.
2. Context truncation — caps the LLM chat context at 40 items and trims the
internal transcript back to 40 entries once it exceeds 80, preventing
unbounded growth in long sessions.
3. LLM timeout recovery — a watchdog detects when the agent has been silent
for >60s despite user activity and sends a recovery reply asking the user
to repeat their question, instead of leaving the bot permanently silent.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
53
voice.py
53
voice.py
@@ -108,11 +108,15 @@ def _is_stt_artifact(text: str) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
_MAX_CHAT_CTX_ITEMS = 40 # Keep last N items in LLM context to prevent unbounded growth
|
||||||
|
|
||||||
|
|
||||||
class _NoiseFilterAgent(Agent):
|
class _NoiseFilterAgent(Agent):
|
||||||
"""Agent that suppresses ElevenLabs STT artifacts before LLM sees them.
|
"""Agent that suppresses ElevenLabs STT artifacts before LLM sees them.
|
||||||
|
|
||||||
Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
|
Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
|
||||||
risk to the audio pipeline. Raises StopResponse to silently discard noise.
|
risk to the audio pipeline. Raises StopResponse to silently discard noise.
|
||||||
|
Also truncates chat context to prevent unbounded growth in long sessions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
async def on_user_turn_completed(
|
async def on_user_turn_completed(
|
||||||
@@ -126,6 +130,15 @@ class _NoiseFilterAgent(Agent):
|
|||||||
turn_ctx.items.pop()
|
turn_ctx.items.pop()
|
||||||
raise StopResponse()
|
raise StopResponse()
|
||||||
|
|
||||||
|
# Truncate context: keep system/instructions + last N items
|
||||||
|
if len(turn_ctx.items) > _MAX_CHAT_CTX_ITEMS + 5:
|
||||||
|
# Preserve first item (system prompt) and trim middle
|
||||||
|
keep_start = 1 # system prompt
|
||||||
|
keep_end = _MAX_CHAT_CTX_ITEMS
|
||||||
|
old_len = len(turn_ctx.items)
|
||||||
|
turn_ctx.items[:] = turn_ctx.items[:keep_start] + turn_ctx.items[-keep_end:]
|
||||||
|
logger.info("CONTEXT_TRUNCATED: %d -> %d items", old_len, len(turn_ctx.items))
|
||||||
|
|
||||||
|
|
||||||
_vad = None
|
_vad = None
|
||||||
def _get_vad():
|
def _get_vad():
|
||||||
@@ -756,11 +769,15 @@ class VoiceSession:
|
|||||||
text = ev.transcript or ""
|
text = ev.transcript or ""
|
||||||
if text and _is_stt_artifact(text):
|
if text and _is_stt_artifact(text):
|
||||||
logger.warning("NOISE_LEAK: artifact reached STT: %s", text)
|
logger.warning("NOISE_LEAK: artifact reached STT: %s", text)
|
||||||
else:
|
return # Do NOT add artifacts to transcript — they inflate context
|
||||||
|
if text:
|
||||||
logger.info("USER_SPEECH: %s", text)
|
logger.info("USER_SPEECH: %s", text)
|
||||||
if ev.transcript:
|
if ev.transcript:
|
||||||
_last_user_speech.append(ev.transcript)
|
_last_user_speech.append(ev.transcript)
|
||||||
self._transcript.append({"role": "user", "text": ev.transcript})
|
self._transcript.append({"role": "user", "text": ev.transcript})
|
||||||
|
# Cap transcript to prevent unbounded memory growth
|
||||||
|
if len(self._transcript) > _MAX_CHAT_CTX_ITEMS * 2:
|
||||||
|
self._transcript[:] = self._transcript[-_MAX_CHAT_CTX_ITEMS:]
|
||||||
|
|
||||||
@self.session.on("conversation_item_added")
|
@self.session.on("conversation_item_added")
|
||||||
def _on_conversation_item(ev):
|
def _on_conversation_item(ev):
|
||||||
@@ -1047,13 +1064,43 @@ class VoiceSession:
|
|||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
logger.error("Greeting timed out")
|
logger.error("Greeting timed out")
|
||||||
|
|
||||||
# VAD watchdog: log diagnostic and attempt E2EE key recovery if stuck
|
# VAD watchdog: log diagnostic, attempt E2EE key recovery, and recover from LLM failures
|
||||||
import time as _time
|
import time as _time
|
||||||
|
_last_agent_speech_time = _time.monotonic()
|
||||||
|
_llm_recovery_attempted = False
|
||||||
|
|
||||||
|
@self.session.on("conversation_item_added")
|
||||||
|
def _track_agent_speech(ev):
|
||||||
|
nonlocal _last_agent_speech_time, _llm_recovery_attempted
|
||||||
|
role = getattr(ev.item, "role", "?")
|
||||||
|
if role == "assistant":
|
||||||
|
_last_agent_speech_time = _time.monotonic()
|
||||||
|
_llm_recovery_attempted = False # reset on successful speech
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
await asyncio.sleep(10)
|
await asyncio.sleep(10)
|
||||||
|
|
||||||
|
# LLM timeout recovery: if user has been speaking but agent
|
||||||
|
# hasn't responded in >60s, the LLM pipeline is likely stuck
|
||||||
|
sc = _vad_state_log.get("speaking_count", 0)
|
||||||
|
agent_silent_secs = _time.monotonic() - _last_agent_speech_time
|
||||||
|
if sc > 0 and agent_silent_secs > 60 and not _llm_recovery_attempted:
|
||||||
|
_llm_recovery_attempted = True
|
||||||
|
logger.warning(
|
||||||
|
"LLM_RECOVERY: agent silent for %.0fs after %d user turns "
|
||||||
|
"— sending recovery reply", agent_silent_secs, sc)
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(
|
||||||
|
self.session.generate_reply(
|
||||||
|
instructions="Entschuldigung, ich hatte kurz ein technisches Problem. "
|
||||||
|
"Kannst du deine letzte Frage bitte wiederholen?"),
|
||||||
|
timeout=30.0)
|
||||||
|
logger.info("LLM_RECOVERY: recovery reply sent successfully")
|
||||||
|
except Exception as exc:
|
||||||
|
logger.error("LLM_RECOVERY: recovery reply failed: %s", exc)
|
||||||
|
|
||||||
away_since = _vad_state_log.get("away_since")
|
away_since = _vad_state_log.get("away_since")
|
||||||
if away_since and (_time.monotonic() - away_since) > 30:
|
if away_since and (_time.monotonic() - away_since) > 30:
|
||||||
sc = _vad_state_log.get("speaking_count", 0)
|
|
||||||
e2ee_ok = any(
|
e2ee_ok = any(
|
||||||
str(getattr(p, '_e2ee_state', '')) == 'OK'
|
str(getattr(p, '_e2ee_state', '')) == 'OK'
|
||||||
for p in self.lk_room.remote_participants.values()
|
for p in self.lk_room.remote_participants.values()
|
||||||
|
|||||||
Reference in New Issue
Block a user