fix(stt): filter ElevenLabs noise annotations before LLM
scribe_v2_realtime annotates background audio as *Störgeräusche*, *Fernsehgeräusche*, etc. Override stt_node to drop these so the LLM only receives actual speech transcripts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
26
voice.py
26
voice.py
@@ -55,6 +55,30 @@ def _build_voice_prompt() -> str:
|
||||
datetime=now.strftime("%A, %d. %B %Y %H:%M %Z")
|
||||
)
|
||||
|
||||
# ElevenLabs scribe_v2_realtime annotates non-speech audio as *Geräusch* etc.
|
||||
# Filter these out so the LLM never sees them.
|
||||
# Matches text that consists entirely of a single asterisk-wrapped annotation,
# e.g. "*Störgeräusche*". Text with anything outside the asterisks, or with
# more than one asterisk pair, does not match.
_NOISE_ANNOTATION_RE = re.compile(r'^\*[^*]+\*$')
|
||||
|
||||
class _VoiceAgent(Agent):
    """Agent whose STT node drops transcripts that are only a noise annotation."""

    async def stt_node(self, audio, model_settings):
        """Yield events from the default STT node, minus noise annotations.

        Delegates to ``Agent.default.stt_node`` and forwards every item it
        produces, except those whose text (once stripped) matches
        ``_NOISE_ANNOTATION_RE``. For ``SpeechEvent`` items only the first
        alternative's text is inspected; plain ``str`` items are inspected
        directly. Anything else is passed through untouched.
        """
        from livekit.agents import stt as lk_stt

        stream = Agent.default.stt_node(self, audio, model_settings)
        # The default node may hand back an awaitable; resolve it first.
        if asyncio.iscoroutine(stream):
            stream = await stream
        if stream is None:
            return

        async for item in stream:
            if isinstance(item, lk_stt.SpeechEvent):
                candidates = getattr(item, 'alternatives', None)
                if not candidates:
                    # Nothing to inspect on this event, so forward it as-is.
                    yield item
                    continue
                transcript = candidates[0].text
            elif isinstance(item, str):
                transcript = item
            else:
                yield item
                continue

            if _NOISE_ANNOTATION_RE.match(transcript.strip()):
                logger.debug("STT noise filtered: %s", transcript)
                continue
            yield item
|
||||
|
||||
|
||||
_vad = None
|
||||
def _get_vad():
|
||||
global _vad
|
||||
@@ -611,7 +635,7 @@ class VoiceSession:
|
||||
logger.info("SEARCH_RESULT: %s", result[:200])
|
||||
return result
|
||||
|
||||
agent = Agent(
|
||||
agent = _VoiceAgent(
|
||||
instructions=_build_voice_prompt() + memory_section,
|
||||
tools=[search_web],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user