fix(stt): filter ElevenLabs noise annotations before LLM

scribe_v2_realtime annotates background audio as *Störgeräusche*, *Fernsehgeräusche*, etc. Override stt_node in an Agent subclass to drop transcripts that consist solely of such an annotation, so the LLM only receives actual speech transcripts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-22 17:59:17 +02:00
parent 02a7c91eaf
commit 1e1911995f
1 changed files with 25 additions and 1 deletions
--- a/voice.py
+++ b/voice.py
@@ -55,6 +55,30 @@ def _build_voice_prompt() -> str:
        datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
    )

+# ElevenLabs scribe_v2_realtime renders non-speech audio as asterisk-wrapped annotations
+# (e.g. *Geräusch*). Match text consisting solely of one, so it is filtered before the LLM.
+_NOISE_ANNOTATION_RE = re.compile(r'^\*[^*]+\*$')
+
+class _VoiceAgent(Agent):  # Agent subclass whose stt_node drops noise-only transcripts
+    async def stt_node(self, audio, model_settings):  # async generator: re-yields upstream STT events minus noise annotations
+        from livekit.agents import stt as lk_stt  # function-scope import; used only for the isinstance check below
+        result = Agent.default.stt_node(self, audio, model_settings)  # presumably the framework's built-in STT node; TODO confirm
+        if asyncio.iscoroutine(result):  # accept a coroutine as well as a ready-made async iterable
+            result = await result
+        if result is None:  # nothing to iterate: finish without yielding anything
+            return
+        async for event in result:
+            if isinstance(event, lk_stt.SpeechEvent):
+                alts = getattr(event, 'alternatives', None)  # a missing or empty alternatives list falls through to yield
+                if alts and _NOISE_ANNOTATION_RE.match(alts[0].text.strip()):  # only the first alternative is inspected
+                    logger.debug("STT noise filtered: %s", alts[0].text)
+                    continue  # drop the whole event
+            elif isinstance(event, str) and _NOISE_ANNOTATION_RE.match(event.strip()):  # plain-string events get the same check
+                logger.debug("STT noise filtered: %s", event)
+                continue
+            yield event  # every other event passes through unchanged
+
+
 _vad = None
 def _get_vad():
    global _vad
@@ -611,7 +635,7 @@ class VoiceSession:
                logger.info("SEARCH_RESULT: %s", result[:200])
                return result

-            agent = Agent(
+            agent = _VoiceAgent(
                instructions=_build_voice_prompt() + memory_section,
                tools=[search_web],
            )