diff --git a/voice.py b/voice.py
index 6a6c561..6f94414 100644
--- a/voice.py
+++ b/voice.py
@@ -108,11 +108,15 @@ def _is_stt_artifact(text: str) -> bool:
     return False
 
 
+_MAX_CHAT_CTX_ITEMS = 40  # Keep last N items in LLM context to prevent unbounded growth
+
+
 class _NoiseFilterAgent(Agent):
     """Agent that suppresses ElevenLabs STT artifacts before LLM sees them.
 
     Uses on_user_turn_completed() which runs after VAD+STT, so no
     backpressure risk to the audio pipeline. Raises StopResponse to silently discard noise.
+    Also truncates chat context to prevent unbounded growth in long sessions.
     """
 
     async def on_user_turn_completed(
@@ -126,6 +130,15 @@ class _NoiseFilterAgent(Agent):
                 turn_ctx.items.pop()
             raise StopResponse()
+
+        # Truncate context: keep system/instructions + last N items
+        if len(turn_ctx.items) > _MAX_CHAT_CTX_ITEMS + 5:
+            # Preserve first item (system prompt) and trim middle
+            keep_start = 1  # system prompt
+            keep_end = _MAX_CHAT_CTX_ITEMS
+            old_len = len(turn_ctx.items)
+            turn_ctx.items[:] = turn_ctx.items[:keep_start] + turn_ctx.items[-keep_end:]
+            logger.info("CONTEXT_TRUNCATED: %d -> %d items", old_len, len(turn_ctx.items))
 
 _vad = None
 
 def _get_vad():
@@ -756,11 +769,15 @@ class VoiceSession:
             text = ev.transcript or ""
             if text and _is_stt_artifact(text):
                 logger.warning("NOISE_LEAK: artifact reached STT: %s", text)
-            else:
+                return  # Do NOT add artifacts to transcript — they inflate context
+            if text:
                 logger.info("USER_SPEECH: %s", text)
             if ev.transcript:
                 _last_user_speech.append(ev.transcript)
                 self._transcript.append({"role": "user", "text": ev.transcript})
+                # Cap transcript to prevent unbounded memory growth
+                if len(self._transcript) > _MAX_CHAT_CTX_ITEMS * 2:
+                    self._transcript[:] = self._transcript[-_MAX_CHAT_CTX_ITEMS:]
 
         @self.session.on("conversation_item_added")
         def _on_conversation_item(ev):
@@ -1047,13 +1064,43 @@
        except asyncio.TimeoutError:
            logger.error("Greeting timed out")
 
-        # VAD watchdog: log diagnostic and attempt E2EE key recovery if stuck
+        # VAD watchdog: log diagnostic, attempt E2EE key recovery, and recover from LLM failures
        import time as _time
+        _last_agent_speech_time = _time.monotonic()
+        _llm_recovery_attempted = False
+
+        @self.session.on("conversation_item_added")
+        def _track_agent_speech(ev):
+            nonlocal _last_agent_speech_time, _llm_recovery_attempted
+            role = getattr(ev.item, "role", "?")
+            if role == "assistant":
+                _last_agent_speech_time = _time.monotonic()
+                _llm_recovery_attempted = False  # reset on successful speech
+
        while True:
            await asyncio.sleep(10)
+
+            # LLM timeout recovery: if user has been speaking but agent
+            # hasn't responded in >60s, the LLM pipeline is likely stuck
+            sc = _vad_state_log.get("speaking_count", 0)
+            agent_silent_secs = _time.monotonic() - _last_agent_speech_time
+            if sc > 0 and agent_silent_secs > 60 and not _llm_recovery_attempted:
+                _llm_recovery_attempted = True
+                logger.warning(
+                    "LLM_RECOVERY: agent silent for %.0fs after %d user turns "
+                    "— sending recovery reply", agent_silent_secs, sc)
+                try:
+                    await asyncio.wait_for(
+                        self.session.generate_reply(
+                            instructions="Entschuldigung, ich hatte kurz ein technisches Problem. "
+                            "Kannst du deine letzte Frage bitte wiederholen?"),
+                        timeout=30.0)
+                    logger.info("LLM_RECOVERY: recovery reply sent successfully")
+                except Exception as exc:
+                    logger.error("LLM_RECOVERY: recovery reply failed: %s", exc)
+
            away_since = _vad_state_log.get("away_since")
            if away_since and (_time.monotonic() - away_since) > 30:
-                sc = _vad_state_log.get("speaking_count", 0)
                e2ee_ok = any(
                    str(getattr(p, '_e2ee_state', '')) == 'OK'
                    for p in self.lk_room.remote_participants.values()