diff --git a/voice.py b/voice.py
index 4f6c345..5d4863e 100644
--- a/voice.py
+++ b/voice.py
@@ -16,7 +16,7 @@ import re
 import aiohttp
 import httpx
 from livekit import rtc, api as lkapi
-from livekit.agents import Agent, AgentSession, function_tool, room_io
+from livekit.agents import Agent, AgentSession, StopResponse, function_tool, room_io, llm
 from livekit.plugins import openai as lk_openai, elevenlabs, silero
 from openai import AsyncOpenAI
 
@@ -58,27 +58,35 @@ def _build_voice_prompt() -> str:
     )
 
 # ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
-# Filter these out so the LLM never sees them.
-_NOISE_ANNOTATION_RE = re.compile(r'^\*[^*]+\*$')
+# Filter these via on_user_turn_completed (downstream of VAD+STT, no pipeline impact).
+# A completed turn can join several STT finals, so match one or more annotations.
+_NOISE_ANNOTATION_RE = re.compile(r'^\s*(?:\*[^*]+\*\s*)+$')
 
-class _VoiceAgent(Agent):
-    async def stt_node(self, audio, model_settings):
-        from livekit.agents import stt as lk_stt
-        result = Agent.default.stt_node(self, audio, model_settings)
-        if asyncio.iscoroutine(result):
-            result = await result
-        if result is None:
-            return
-        async for event in result:
-            if isinstance(event, lk_stt.SpeechEvent):
-                alts = getattr(event, 'alternatives', None)
-                if alts and _NOISE_ANNOTATION_RE.match(alts[0].text.strip()):
-                    logger.debug("STT noise filtered: %s", alts[0].text)
-                    continue
-            elif isinstance(event, str) and _NOISE_ANNOTATION_RE.match(event.strip()):
-                logger.debug("STT noise filtered: %s", event)
-                continue
-            yield event
+
+class _NoiseFilterAgent(Agent):
+    """Agent that discards user turns made up solely of noise annotations.
+
+    Filtering happens in on_user_turn_completed(), which runs after VAD+STT,
+    so it adds no backpressure risk to the audio pipeline.
+    """
+
+    async def on_user_turn_completed(
+        self, turn_ctx: llm.ChatContext, new_message: llm.ChatMessage
+    ) -> None:
+        """Silently drop the turn when its text is only *...* annotations.
+
+        Raises:
+            StopResponse: when the turn is noise, so no reply is generated.
+        """
+        text = (new_message.text_content or "").strip()
+        if not _NOISE_ANNOTATION_RE.match(text):
+            return
+        logger.info("Noise annotation suppressed: %s", text)
+        # Defensive: if the message is already the last context item, drop it
+        # so suppressed noise never accumulates in the chat history.
+        if turn_ctx.items and turn_ctx.items[-1] is new_message:
+            turn_ctx.items.pop()
+        raise StopResponse()
 
 
 _vad = None
@@ -644,7 +652,7 @@ class VoiceSession:
             logger.info("SEARCH_RESULT: %s", result[:200])
             return result
 
-        agent = Agent(
+        agent = _NoiseFilterAgent(
             instructions=_build_voice_prompt() + memory_section,
             tools=[search_web],
         )