fix(noise): filter STT noise annotations via on_user_turn_completed
Replace the broken _VoiceAgent stt_node override with _NoiseFilterAgent, which uses on_user_turn_completed() + StopResponse. This hook runs downstream of VAD+STT, so there is no backpressure risk to the audio pipeline. When ElevenLabs scribe_v2_realtime produces noise annotations such as *Störgeräusche*, the agent now silently suppresses them before the LLM responds. The prompt-based filter is kept as defense-in-depth.

Fixes: MAT-41
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
44
voice.py
44
voice.py
@@ -16,7 +16,7 @@ import re
|
|||||||
import aiohttp
|
import aiohttp
|
||||||
import httpx
|
import httpx
|
||||||
from livekit import rtc, api as lkapi
|
from livekit import rtc, api as lkapi
|
||||||
from livekit.agents import Agent, AgentSession, function_tool, room_io
|
from livekit.agents import Agent, AgentSession, StopResponse, function_tool, room_io, llm
|
||||||
from livekit.plugins import openai as lk_openai, elevenlabs, silero
|
from livekit.plugins import openai as lk_openai, elevenlabs, silero
|
||||||
from openai import AsyncOpenAI
|
from openai import AsyncOpenAI
|
||||||
|
|
||||||
@@ -58,27 +58,27 @@ def _build_voice_prompt() -> str:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
|
# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
|
||||||
# Filter these out so the LLM never sees them.
|
# Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
|
||||||
_NOISE_ANNOTATION_RE = re.compile(r'^\*[^*]+\*$')
|
_NOISE_ANNOTATION_RE = re.compile(r'^\s*\*[^*]+\*\s*$')
|
||||||
|
|
||||||
class _VoiceAgent(Agent):
|
|
||||||
async def stt_node(self, audio, model_settings):
|
class _NoiseFilterAgent(Agent):
|
||||||
from livekit.agents import stt as lk_stt
|
"""Agent that suppresses ElevenLabs noise annotations before LLM sees them.
|
||||||
result = Agent.default.stt_node(self, audio, model_settings)
|
|
||||||
if asyncio.iscoroutine(result):
|
Uses on_user_turn_completed() which runs after VAD+STT, so no backpressure
|
||||||
result = await result
|
risk to the audio pipeline. Raises StopResponse to silently discard noise.
|
||||||
if result is None:
|
"""
|
||||||
return
|
|
||||||
async for event in result:
|
async def on_user_turn_completed(
|
||||||
if isinstance(event, lk_stt.SpeechEvent):
|
self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
|
||||||
alts = getattr(event, 'alternatives', None)
|
) -> None:
|
||||||
if alts and _NOISE_ANNOTATION_RE.match(alts[0].text.strip()):
|
text = (new_message.text_content or "").strip()
|
||||||
logger.debug("STT noise filtered: %s", alts[0].text)
|
if text and _NOISE_ANNOTATION_RE.match(text):
|
||||||
continue
|
logger.info("Noise annotation suppressed: %s", text)
|
||||||
elif isinstance(event, str) and _NOISE_ANNOTATION_RE.match(event.strip()):
|
# Remove the noise message from context so it doesn't accumulate
|
||||||
logger.debug("STT noise filtered: %s", event)
|
if turn_ctx.items and turn_ctx.items[-1] is new_message:
|
||||||
continue
|
turn_ctx.items.pop()
|
||||||
yield event
|
raise StopResponse()
|
||||||
|
|
||||||
|
|
||||||
_vad = None
|
_vad = None
|
||||||
@@ -644,7 +644,7 @@ class VoiceSession:
|
|||||||
logger.info("SEARCH_RESULT: %s", result[:200])
|
logger.info("SEARCH_RESULT: %s", result[:200])
|
||||||
return result
|
return result
|
||||||
|
|
||||||
agent = Agent(
|
agent = _NoiseFilterAgent(
|
||||||
instructions=_build_voice_prompt() + memory_section,
|
instructions=_build_voice_prompt() + memory_section,
|
||||||
tools=[search_web],
|
tools=[search_web],
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user