# ElevenLabs scribe_v2_realtime reports non-speech audio as an annotation
# wrapped in asterisks (e.g. *Geräusch*). A transcript that consists of
# nothing but one such annotation is dropped so the LLM never sees it.
_NOISE_ANNOTATION_RE = re.compile(r'^\*[^*]+\*$')


class _VoiceAgent(Agent):
    """Agent whose STT stage discards annotation-only transcripts."""

    async def stt_node(self, audio, model_settings):
        """Yield the default STT node's events, minus noise annotations.

        Delegates to ``Agent.default.stt_node`` and re-yields what it
        produces. An item is suppressed (and logged at debug level) when it
        is either

        * a ``SpeechEvent`` whose first alternative's stripped text matches
          ``_NOISE_ANNOTATION_RE``, or
        * a plain ``str`` whose stripped value matches the same pattern.

        Everything else, including speech events without alternatives,
        passes through unchanged. Nothing is yielded when the default node
        returns ``None``.
        """
        from livekit.agents import stt as lk_stt

        upstream = Agent.default.stt_node(self, audio, model_settings)
        # The default node may hand back a coroutine instead of the stream
        # itself; resolve it before iterating.
        if asyncio.iscoroutine(upstream):
            upstream = await upstream
        if upstream is None:
            return

        async for item in upstream:
            # Work out which text, if any, this item carries. Items with no
            # inspectable text are forwarded immediately.
            if isinstance(item, lk_stt.SpeechEvent):
                hypotheses = getattr(item, 'alternatives', None)
                if not hypotheses:
                    yield item
                    continue
                transcript = hypotheses[0].text
            elif isinstance(item, str):
                transcript = item
            else:
                yield item
                continue

            if _NOISE_ANNOTATION_RE.match(transcript.strip()):
                logger.debug("STT noise filtered: %s", transcript)
                continue
            yield item