fix(noise): filter STT noise annotations via on_user_turn_completed

Replace the broken _VoiceAgent stt_node override with _NoiseFilterAgent, which
uses on_user_turn_completed() + StopResponse. This hook runs downstream of
VAD+STT, so there is no backpressure risk to the audio pipeline.

When ElevenLabs scribe_v2_realtime produces a transcript that consists solely
of a noise annotation (*Störgeräusche* etc.), the agent now silently suppresses
it before the LLM responds. The prompt-based filter is kept as defense-in-depth.

Fixes: MAT-41

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-22 19:07:31 +02:00
parent 7f03cc1f37
commit fa9e95b250

View File

@@ -16,7 +16,7 @@ import re
import aiohttp import aiohttp
import httpx import httpx
from livekit import rtc, api as lkapi from livekit import rtc, api as lkapi
from livekit.agents import Agent, AgentSession, function_tool, room_io from livekit.agents import Agent, AgentSession, StopResponse, function_tool, room_io, llm
from livekit.plugins import openai as lk_openai, elevenlabs, silero from livekit.plugins import openai as lk_openai, elevenlabs, silero
from openai import AsyncOpenAI from openai import AsyncOpenAI
@@ -58,27 +58,27 @@ def _build_voice_prompt() -> str:
) )
# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc. # ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
# Filter these out so the LLM never sees them. # Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
_NOISE_ANNOTATION_RE = re.compile(r'^\*[^*]+\*$') _NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$')
class _VoiceAgent(Agent):
async def stt_node(self, audio, model_settings): class _NoiseFilterAgent(Agent):
from livekit.agents import stt as lk_stt """Agent that suppresses ElevenLabs noise annotations before LLM sees them.
result = Agent.default.stt_node(self, audio, model_settings)
if asyncio.iscoroutine(result): Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
result = await result risk to the audio pipeline. Raises StopResponse to silently discard noise.
if result is None: """
return
async for event in result: async def on_user_turn_completed(
if isinstance(event, lk_stt.SpeechEvent): self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
alts = getattr(event, 'alternatives', None) ) -> None:
if alts and _NOISE_ANNOTATION_RE.match(alts[0].text.strip()): text = (new_message.text_content or "").strip()
logger.debug("STT noise filtered: %s", alts[0].text) if text and _NOISE_ANNOTATION_RE.match(text):
continue logger.info("Noise annotation suppressed: %s", text)
elif isinstance(event, str) and _NOISE_ANNOTATION_RE.match(event.strip()): # Remove the noise message from context so it doesn't accumulate
logger.debug("STT noise filtered: %s", event) if turn_ctx.items and turn_ctx.items[-1] is new_message:
continue turn_ctx.items.pop()
yield event raise StopResponse()
_vad = None _vad = None
@@ -644,7 +644,7 @@ class VoiceSession:
logger.info("SEARCH_RESULT: %s", result[:200]) logger.info("SEARCH_RESULT: %s", result[:200])
return result return result
agent = Agent( agent = _NoiseFilterAgent(
instructions=_build_voice_prompt() + memory_section, instructions=_build_voice_prompt() + memory_section,
tools=[search_web], tools=[search_web],
) )