fix(noise): filter STT noise annotations via on_user_turn_completed

Replace the broken _VoiceAgent stt_node override with _NoiseFilterAgent, which uses on_user_turn_completed() + StopResponse. This hook runs downstream of VAD+STT, so it poses no backpressure risk to the audio pipeline.

When ElevenLabs scribe_v2_realtime emits noise annotations such as *Störgeräusche*, the agent now silently suppresses them before the LLM responds. The prompt-based filter is kept as defense-in-depth.

Fixes: MAT-41
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 19:07:31 +02:00
parent 7f03cc1f37
commit fa9e95b250
1 changed file with 22 additions and 22 deletions
--- a/voice.py
+++ b/voice.py
@@ -16,7 +16,7 @@ import re
 import aiohttp
 import httpx
 from livekit import rtc, api as lkapi
-from livekit.agents import Agent, AgentSession, function_tool, room_io
+from livekit.agents import Agent, AgentSession, StopResponse, function_tool, room_io, llm
 from livekit.plugins import openai as lk_openai, elevenlabs, silero
 from openai import AsyncOpenAI
@@ -58,27 +58,27 @@ def _build_voice_prompt() -> str:
    )
 # ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
-# Filter these out so the LLM never sees them.
+# Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
-_NOISE_ANNOTATION_RE = re.compile(r'^\*[^*]+\*$')
+_NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$')
-class _VoiceAgent(Agent):
+
-    async def stt_node(self, audio, model_settings):
+class _NoiseFilterAgent(Agent):
-        from livekit.agents import stt as lk_stt
+    """Agent that suppresses ElevenLabs noise annotations before LLM sees them.
-        result = Agent.default.stt_node(self, audio, model_settings)
+
-        if asyncio.iscoroutine(result):
+    Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
-            result = await result
+    risk to the audio pipeline. Raises StopResponse to silently discard noise.
-        if result is None:
+    """
-            return
+
-        async for event in result:
+    async def on_user_turn_completed(
-            if isinstance(event, lk_stt.SpeechEvent):
+        self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
-                alts = getattr(event, 'alternatives', None)
+    ) -> None:
-                if alts and _NOISE_ANNOTATION_RE.match(alts[0].text.strip()):
+        text = (new_message.text_content or "").strip()
-                    logger.debug("STT noise filtered: %s", alts[0].text)
+        if text and _NOISE_ANNOTATION_RE.match(text):
-                    continue
+            logger.info("Noise annotation suppressed: %s", text)
-            elif isinstance(event, str) and _NOISE_ANNOTATION_RE.match(event.strip()):
+            # Remove the noise message from context so it doesn't accumulate
-                logger.debug("STT noise filtered: %s", event)
+            if turn_ctx.items and turn_ctx.items[-1] is new_message:
-                continue
+                turn_ctx.items.pop()
-            yield event
+            raise StopResponse()
 _vad = None
@@ -644,7 +644,7 @@ class VoiceSession:
                logger.info("SEARCH_RESULT: %s", result[:200])
                return result
-            agent = Agent(
+            agent = _NoiseFilterAgent(
                instructions=_build_voice_prompt() + memory_section,
                tools=[search_web],
            )