fix(stt): filter ElevenLabs noise annotations before LLM

scribe_v2_realtime annotates background audio as *Störgeräusche*,
*Fernsehgeräusche* etc. Override stt_node to drop these so the LLM
only receives actual speech transcripts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Christian Gick
2026-02-22 17:59:17 +02:00
parent 02a7c91eaf
commit 1e1911995f

View File

@@ -55,6 +55,30 @@ def _build_voice_prompt() -> str:
datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
)
# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.,
# i.e. a label wrapped in asterisks. Filter these out so the LLM never sees them.
# The pattern matches text that is exactly one such annotation: a leading '*',
# one or more characters that are not '*', and a closing '*'. Text that combines
# an annotation with other words, or with a second annotation, does not match.
_NOISE_ANNOTATION_RE = re.compile(r'^\*[^*]+\*$')
class _VoiceAgent(Agent):
    """Agent whose STT stage drops transcripts that are only a noise annotation."""

    async def stt_node(self, audio, model_settings):
        """Yield the default STT node's output minus noise-only items.

        Delegates to ``Agent.default.stt_node`` and forwards everything it
        produces, except:

        * ``SpeechEvent`` items whose first alternative's stripped text
          matches ``_NOISE_ANNOTATION_RE``;
        * plain ``str`` items whose stripped text matches the same pattern.

        Each dropped item is logged at debug level. If the default node
        returns a coroutine it is awaited first; if it resolves to ``None``
        nothing is yielded.
        """
        from livekit.agents import stt as lk_stt

        upstream = Agent.default.stt_node(self, audio, model_settings)
        if asyncio.iscoroutine(upstream):
            upstream = await upstream
        if upstream is None:
            return

        async for item in upstream:
            # Text of the annotation to drop, or None when the item is kept.
            noise = None
            if isinstance(item, lk_stt.SpeechEvent):
                candidates = getattr(item, 'alternatives', None)
                # Only the first alternative decides whether the event is noise.
                if candidates and _NOISE_ANNOTATION_RE.match(candidates[0].text.strip()):
                    noise = candidates[0].text
            elif isinstance(item, str) and _NOISE_ANNOTATION_RE.match(item.strip()):
                noise = item

            if noise is None:
                yield item
            else:
                logger.debug("STT noise filtered: %s", noise)
_vad = None
def _get_vad():
global _vad
@@ -611,7 +635,7 @@ class VoiceSession:
logger.info("SEARCH_RESULT: %s", result[:200])
return result
agent = Agent(
agent = _VoiceAgent(
instructions=_build_voice_prompt() + memory_section,
tools=[search_web],
)