fix(stt): filter ElevenLabs noise annotations before LLM

scribe_v2_realtime annotates background audio with markers such as *Störgeräusche* or *Fernsehgeräusche*. Override stt_node to drop these so the LLM only receives actual speech transcripts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-22 17:59:17 +02:00
parent 02a7c91eaf
commit 1e1911995f
1 changed files with 25 additions and 1 deletions
--- a/voice.py
+++ b/voice.py
@@ -55,6 +55,30 @@ def _build_voice_prompt() -> str:
        datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
    )
 # ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
 # Filter these out so the LLM never sees them.
 # The pattern only matches when the whole string is a single asterisk-wrapped
 # annotation with no inner asterisks; text that merely contains an annotation
 # next to other words does not match.
 _NOISE_ANNOTATION_RE = re.compile(r'^\*[^*]+\*$')
 class _VoiceAgent(Agent):
    """Agent whose STT node drops transcripts that are only a noise annotation."""

    async def stt_node(self, audio, model_settings):
        """Wrap the default STT node and skip noise-only items.

        Delegates to ``Agent.default.stt_node`` and re-yields every item it
        produces, except:

        * speech events whose first alternative's stripped text matches
          ``_NOISE_ANNOTATION_RE``;
        * plain strings whose stripped text matches ``_NOISE_ANNOTATION_RE``.

        Each skipped item is logged at debug level. Nothing is yielded when
        the default node produces ``None``.
        """
        from livekit.agents import stt as lk_stt

        def _noise_text(item):
            # Return the raw text of a noise-only item, or None to keep it.
            if isinstance(item, lk_stt.SpeechEvent):
                candidates = getattr(item, 'alternatives', None)
                if not candidates:
                    return None
                if _NOISE_ANNOTATION_RE.match(candidates[0].text.strip()):
                    return candidates[0].text
                return None
            if isinstance(item, str) and _NOISE_ANNOTATION_RE.match(item.strip()):
                return item
            return None

        stream = Agent.default.stt_node(self, audio, model_settings)
        # The default node may hand back an awaitable instead of the stream.
        if asyncio.iscoroutine(stream):
            stream = await stream
        if stream is not None:
            async for item in stream:
                dropped = _noise_text(item)
                if dropped is not None:
                    logger.debug("STT noise filtered: %s", dropped)
                    continue
                yield item
 # Module-level slot, initially None. _get_vad() below declares it global,
 # presumably to populate it lazily on first use (confirm against the full file).
 _vad = None
 def _get_vad():
    global _vad
@@ -611,7 +635,7 @@ class VoiceSession:
                logger.info("SEARCH_RESULT: %s", result[:200])
                return result
-            agent = Agent(
+            agent = _VoiceAgent(
                instructions=_build_voice_prompt() + memory_section,
                tools=[search_web],
            )